In [1]:
import pandas

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import numpy as np

In [2]:
def getData():
    with open("sentiment labelled sentences/imdb_labelled.txt", "r") as f:
        data = f.read().strip()

    data = data.split("\n")

    results = []
    for d in data:
        sentence, label = d.split("\t")
        results.append({
            "sentence": sentence,
            "label": label,
        })
    return pandas.DataFrame(results)

In [3]:
df = getData()

In [4]:
df["label"].value_counts()

1    500
0    500
Name: label, dtype: int64

In [5]:
df.sample(10)

Unnamed: 0,label,sentence
441,0,But the convoluted plot just didn't convince m...
899,1,Director Neil LaBute uses brutal violence to s...
625,0,"Regardless, the film fails on most levels."
407,0,I couldn't take them seriously.
748,1,I'm a big fan of this series mostly due to Ann...
231,0,"I mean, in a realistic world, she would NEVER ..."
866,1,"Highly recommended for all ages, although the ..."
354,1,"The writers were ""smack on"" and I think the be..."
288,0,Really bad.
961,1,Both actors truly understand and become their ...


In [6]:
df["sentence"] = df["sentence"].map(lambda x: x.lower())

In [7]:
# 取 0~900, 900~1000 當做 train / test
df_train = df[:900].copy()
df_test = df[900:].copy()

In [8]:
label_encoder = LabelEncoder()
vectorizer = CountVectorizer(binary=False, ngram_range=(1, 2), stop_words=None)

In [9]:
vectorizer.fit(df_train["sentence"])
label_encoder.fit(df_train["label"])

LabelEncoder()

In [10]:
X_train = vectorizer.transform(df_train["sentence"])
y_train = label_encoder.transform(df_train["label"])

In [11]:
X_train, y_train

(<900x11531 sparse matrix of type '<class 'numpy.int64'>'
 	with 22829 stored elements in Compressed Sparse Row format>,
 array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 

In [12]:
classifier = LogisticRegression(C=10)

classifier.fit(X_train, y_train)



LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
X_test = vectorizer.transform(df_test["sentence"])

In [14]:
predict_proba = classifier.predict_proba(X_test)

In [15]:
predict_label = np.argmax(predict_proba, axis=-1)

In [16]:
df_test["predict"] = label_encoder.inverse_transform(predict_label)

In [17]:
len(df_test[df_test["label"] == df_test["predict"]]) / len(df_test)

0.81