In [29]:
import pandas as pd
import re

#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

df = pd.read_csv('data/movie-reviews.csv')[:1000]
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [27]:
my_stopwords = stopwords.words("english")
my_stopwords.remove("not")
my_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
ps = PorterStemmer()  # stemming (alternatywnie można użyć: 'lemmatization')

def preprocess_document(text):
    
    text = text.lower()
    text = re.sub(r"[^A-Za-z1-9]", " ", text)
    text = text.split()
    text = [ps.stem(word) for word in text if word not in my_stopwords]
    text = ' '.join(text)
    
    return text

In [35]:
df["text"].apply(preprocess_document)

0      grew b 1965 watch love thunderbird mate school...
1      put movi dvd player sat coke chip expect hope ...
2      peopl not know particular time past like feel ...
3      even though great interest biblic movi bore de...
4      im die hard dad armi fan noth ever chang got t...
                             ...                        
995    oh bad funni way one could explain someth like...
996    believ terribl movi actual made worst actor co...
997    even though slightli older recommend age group...
998    read web site bett davi one find instanc autho...
999    regret seen sinc rate imdb rel high must also ...
Name: text, Length: 1000, dtype: object

In [36]:
df["text"] = df["text"].apply(preprocess_document)
df

Unnamed: 0,text,label
0,grew b 1965 watch love thunderbird mate school...,0
1,put movi dvd player sat coke chip expect hope ...,0
2,peopl not know particular time past like feel ...,0
3,even though great interest biblic movi bore de...,0
4,im die hard dad armi fan noth ever chang got t...,1
...,...,...
995,oh bad funni way one could explain someth like...,0
996,believ terribl movi actual made worst actor co...,0
997,even though slightli older recommend age group...,1
998,read web site bett davi one find instanc autho...,1


In [41]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000)
data = cv.fit_transform(df["text"]).toarray()

In [42]:
cv.get_feature_names_out()[:20]
# len(cv.get_feature_names_out())

array(['12', '14', '15', '195', '196', '197', '198', '25', 'abandon',
       'abil', 'abl', 'absolut', 'absurd', 'abus', 'academi', 'accent',
       'accept', 'accid', 'accomplish', 'accord'], dtype=object)

In [45]:
print(data[0].sum())
print(data[1].sum())

62
142


In [48]:
X = pd.DataFrame(data, columns=cv.get_feature_names_out())
X

Unnamed: 0,12,14,15,195,196,197,198,25,abandon,abil,...,ye,yeah,year,yet,york,young,younger,youth,zero,zombi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
y = df["label"]
y

0      0
1      0
2      0
3      0
4      1
      ..
995    0
996    0
997    1
998    1
999    0
Name: label, Length: 1000, dtype: int64

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [53]:
y.value_counts()  # zbalansowane dane

0    522
1    478
Name: label, dtype: int64

## Regresja logistyczna

In [54]:
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

accuracy_score(y_test, y_pred)

0.7787878787878788

## Drzewo decyzyjne

In [55]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

accuracy_score(y_test, y_pred)

0.6606060606060606

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

models = [MultinomialNB(), GaussianNB(), BernoulliNB()]
scores = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
scores

## SVM

In [57]:
from sklearn.svm import SVC

model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

accuracy_score(y_test, y_pred)

0.7333333333333333

## KNN

In [56]:
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred = model_knn.predict(X_test)

accuracy_score(y_test, y_pred)

0.5454545454545454

## RandomForest

In [61]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=300)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

accuracy_score(y_test, y_pred)

0.8090909090909091

In [60]:
n_estimators_list = [150, 200, 250, 300, 350]
scores = []

for n in n_estimators_list:
    
    model_rf = RandomForestClassifier(n_estimators=n)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
scores

[0.7848484848484848,
 0.8090909090909091,
 0.806060606060606,
 0.8272727272727273,
 0.8151515151515152]