In [7]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import f1_score, precision_score, recall_score

In [8]:
df = pd.read_csv('../data/final_data.csv')
df.head()

Unnamed: 0,cleaned_review,sentiment
0,one of the other reviewer have mention that af...,positive
1,a wonderful little production the filming t...,positive
2,think this be a wonderful way to spend time o...,positive
3,basically there s a family where a little boy ...,negative
4,Petter Matteis Love in the Time of Money be a ...,positive


In [9]:
X = df['cleaned_review']
y = df['sentiment'].replace({'positive':1, 'negative':0})

In [13]:
tfidf = TfidfVectorizer(dtype=np.float32)

In [14]:
X = tfidf.fit_transform(X)

X = X.todense()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(40000, 155043) (10000, 155043) (40000,) (10000,)


In [9]:
models = [
    GaussianNB(),
    LogisticRegression(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier()
]


def display_metrics(true, pred):

    f1 = f1_score(y_true=true, y_pred=pred)
    precision = precision_score(y_true=true, y_pred=pred)
    recall = recall_score(y_true=true, y_pred=pred)

    print(f'F1: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

In [10]:
trained_models = dict()

for model in models:
    print(f'Training -> {model.__class__.__name__}')
    trained_models[model.__class__.__name__] = model.fit(X_train, y_train)
    preds = trained_models[model.__class__.__name__].predict(X_test)
    display_metrics(true=y_test, pred=preds)
    print('-' * 10)

Training -> GaussianNB


TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.