In [14]:
#importing libraries here: 
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read training and testing data from tsv file
train_data = pd.read_csv("Data/kaggle_train.tsv", sep='\t')
test_data = pd.read_csv("Data/kaggle_test.tsv", sep='\t')

In [4]:
X = train_data['Phrase']
y = train_data['Sentiment']

X_test = test_data['Phrase']

# Split train_data into testing and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, random_state=3000)

### MultinomialNB

In [5]:
# Range of min_df values from 0-10
min_df = range(11)
# Range of alpha values from 0-1 by increments of 0.1
alphas = np.arange(0, 1, 0.1)
alphas = np.delete(alphas, 0)

# Keep track of the best min_df, alpha, train score, and validation score
best_md = None
best_alpha = None
best_train_score = 0
best_val_score = 0

# Iterate through min_df and alpha values to find the best combination that produces the highest validation score
for m in min_df:
    #create the vocabulary based on the training data
    vect = TfidfVectorizer(min_df=m, ngram_range=(1,2)).fit(X_train)

    #encode the words in X_train and X_val based on the vocabulary
    X_train_vectorized = vect.transform(X_train)
    X_val_vectorized = vect.transform(X_val)
    
    for a in alphas:
        # Train a MultinomialNB model and get the validation score
        mnb = MultinomialNB(alpha = a).fit(X_train_vectorized, y_train)
        score = mnb.score(X_val_vectorized, y_val)
        
        # If this model has a higher val score, set the best parameters/scores to the current parameters/scores
        if score > best_val_score:
            best_md = m
            best_alpha = a
            best_val_score = score
            best_train_score = mnb.score(X_train_vectorized, y_train)
            
print(f"Best min_df: {best_md}")
print(f"Best alpha: {best_alpha}")
print(f"Best train score: {best_train_score}")
print(f"Best validation score: {best_val_score}")

Best min_df: 0
Best alpha: 0.4
Best train score: 0.7463956170703575
Best validation score: 0.6323529411764706


### Logistic Regression

In [13]:
# Logistic Regression

# Range of min_df values from 0-10
min_df = range(11)

for m in min_df:
    #create the vocabulary based on the training data
    vect = TfidfVectorizer(min_df=m, ngram_range=(1,2)).fit(X_train)

    #encode the words in X_train and X_val based on the vocabulary
    X_train_vectorized = vect.transform(X_train)
    X_val_vectorized = vect.transform(X_val)

    lr = LogisticRegression(n_jobs=-1)
    lr.fit(X_train_vectorized, y_train)
    train_score = lr.score(X_train_vectorized, y_train)
    val_score = lr.score(X_val_vectorized, y_val)

    print(m)
    print(f"train score: {train_score}")
    print(f"validation score: {val_score}")

0
train score: 0.7260508778674869
validation score: 0.6438228886325772
1
train score: 0.7260508778674869
validation score: 0.6438228886325772
2
train score: 0.7343649878251954
validation score: 0.6501025246699987
3
train score: 0.7294710367807253
validation score: 0.6456491093169294
4
train score: 0.7319220171728822
validation score: 0.6486928104575164
5
train score: 0.7240164039471998
validation score: 0.6461937716262975
6
train score: 0.7167916186082276
validation score: 0.644239395104447
7
train score: 0.7151976803793413
validation score: 0.6457772651544278
8
train score: 0.710239651416122
validation score: 0.6461937716262975
9
train score: 0.7051454568755606
validation score: 0.645200563885685
10
train score: 0.7040881712161989
validation score: 0.64439958990132


### Decision Tree 

In [None]:
# Range of min_df values from 0-10
min_df = range(11)

for m in min_df:
    #create the vocabulary based on the training data
    vect = TfidfVectorizer(min_df=m, ngram_range=(1,2)).fit(X_train)

    #encode the words in X_train and X_val based on the vocabulary
    X_train_vectorized = vect.transform(X_train)
    X_val_vectorized = vect.transform(X_val)

    dtc = DecisionTreeClassifier()
    dtc.fit(X_train_vectorized, y_train)
    train_score = dtc.score(X_train_vectorized, y_train)
    val_score = dtc.score(X_val_vectorized, y_val)

    print(m)
    print(f"train score: {train_score}")
    print(f"validation score: {val_score}")