In [1]:
import pandas

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import numpy as np

In [2]:
def getData(path="sentiment labelled sentences/imdb_labelled.txt", encoding="utf-8"):
    """Load a tab-separated ``sentence<TAB>label`` text file into a DataFrame.

    Each non-blank line is split on its LAST tab, so a sentence that itself
    contains tab characters is kept intact instead of breaking the unpacking.

    Parameters
    ----------
    path : str
        Location of the labelled-sentences file. Defaults to the IMDB file
        this notebook was written against.
    encoding : str
        Text encoding used to read the file. Given explicitly so the result
        does not depend on the platform's default locale encoding.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with string columns ``sentence`` and
        ``label``. An empty file yields an empty frame with both columns.

    Raises
    ------
    ValueError
        If a non-blank line contains no tab separator.
    """
    results = []
    with open(path, "r", encoding=encoding) as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip("\r\n")
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            if "\t" not in line:
                raise ValueError(
                    "%s:%d: expected 'sentence<TAB>label', got %r"
                    % (path, line_number, line)
                )
            # Split from the right: the label is always the final field.
            sentence, label = line.rsplit("\t", 1)
            results.append({
                "sentence": sentence,
                # Labels are kept as strings; only stray whitespace is removed.
                "label": label.strip(),
            })
    # Explicit columns keep the schema stable even when no rows were read.
    return pandas.DataFrame(results, columns=["sentence", "label"])

In [3]:
# Load the labelled sentences; `df` has one row per sentence with string
# columns "sentence" and "label" (see getData above).
df = getData()

In [4]:
# Preview ten random rows. The seed is fixed so the preview is the same on
# every Restart & Run All instead of changing with each execution.
df.sample(10, random_state=42)

Unnamed: 0,label,sentence
144,1,This is definitely a cult classic well worth v...
292,1,An AMAZING finale to possibly the BEST trilogy...
302,0,I literally vomited while watching this film.
680,1,It was also the right balance of war and love.
116,0,And generally the lines and plot is weaker tha...
252,0,The soundtrack sucked.
68,0,This movie totally grates on my nerves.
548,1,The original Body and Soul (1947) is a masterp...
71,0,Often the dialogue doesn't really follow from ...
808,1,When a song could explain the emotions of the ...


In [5]:
# Normalise case with the vectorized string accessor. Unlike a per-element
# Python lambda, `.str.lower()` passes missing values through instead of
# raising AttributeError on them.
df["sentence"] = df["sentence"].str.lower()

In [6]:
# 10-fold cross-validation with shuffling. `random_state` pins the shuffle so
# the fold assignment is identical on every run; without it the folds (and
# the accuracy computed from them) differ each time the notebook is executed.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [7]:
# Out-of-fold predictions: each row is predicted exactly once, by a model
# trained on the folds that do not contain it.
predictions = pandas.Series(index=df.index, dtype=object)

for train_index, test_index in kf.split(df):
    # kf.split yields POSITIONAL row numbers, so every row selection and the
    # write-back below use .iloc. Writing with label-based .loc is only
    # correct while df happens to carry a default 0..n-1 index.
    df_train = df.iloc[train_index].copy()
    df_test = df.iloc[test_index].copy()

    # Fresh estimators per fold so nothing learned on one fold leaks into another.
    label_encoder = LabelEncoder()
    vectorizer = CountVectorizer(binary=False, ngram_range=(1, 2), stop_words=None)
    classifier = LogisticRegression(C=10)

    # Vocabulary and label mapping are learned from the training fold only.
    X_train = vectorizer.fit_transform(df_train["sentence"])
    y_train = label_encoder.fit_transform(df_train["label"])

    classifier.fit(X_train, y_train)

    X_test = vectorizer.transform(df_test["sentence"])
    # predict() maps the winning class through classifier.classes_, so the
    # result is a valid encoded label; the argmax column position of
    # predict_proba is not guaranteed to equal the encoded label.
    predicted_codes = classifier.predict(X_test)
    predictions.iloc[test_index] = label_encoder.inverse_transform(predicted_codes)

# Attach the predictions in one assignment; both share df's index.
df["predict"] = predictions



In [8]:
# Accuracy: number of rows whose prediction equals the true label, divided
# by the total number of rows.
is_correct = df["label"] == df["predict"]
int(is_correct.sum()) / len(df)

0.77