In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle


In [2]:
df = pd.read_csv('data/IMDB_Dataset.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
# Pre-trained natural language processing pipeline 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

nlp = spacy.load('en_core_web_sm')

In [4]:
sentiments = df.drop("review", axis=1)
reviews = df.drop("sentiment", axis=1)
sentiments

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive
...,...
49995,positive
49996,negative
49997,negative
49998,negative


In [5]:
import re

def clean_text(text):
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'#\w+', '', text) 
    text = re.sub(r'[^\w\s]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

reviews["review"] = reviews["review"].apply(clean_text)

In [6]:
def lemmatize_and_remove_pronouns(text):
    doc = nlp(text)
    lemmatize_tokens = [token.lemma_ for token in doc 
                        if not token.is_stop 
                        and not token.is_punct
                        and token.pos_ not in ['PRON', 'DET']] 

    return lemmatize_tokens

reviews["lemmatized_tokens"] = reviews["review"].apply(lemmatize_and_remove_pronouns)
reviews.head()

Unnamed: 0,review,lemmatized_tokens
0,one of the other reviewers has mentioned that ...,"[reviewer, mention, watch, , oz, episode, ll,..."
1,a wonderful little production the filming te...,"[wonderful, little, production, , film, tech..."
2,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ..."
3,basically theres a family where a little boy j...,"[basically, s, family, little, boy, jake, thin..."
4,petter matteis love in the time of money is a ...,"[petter, matteis, love, time, money, visually,..."


In [7]:
lemmatized_reviews = reviews["lemmatized_tokens"]
custom_stopwords = set("""
x y year 1990 2005 21 233 say will your yours yourself yourselves you yond yonder yon ye yet z zillion j u umpteen usually us username uponed upons uponing upon ups upping upped up unto until unless unlike unliker unlikest under underneath use used usedest r rath rather rathest rathe re relate related relatively regarding really res respecting respectively q quite que qua n neither neaths neath nethe nethermost necessary necessariest necessarier never nevertheless nigh nighest nigher nine noone nobody nobodies nowhere nowheres no noes nor nos no-one none not notwithstanding nothings nothing nathless natheless t ten tills till tilled tilling to towards toward towardest towarder together too thy thyself thus than that those thou though thous thouses thoroughest thorougher thorough thoroughly thru thruer thruest thro through throughout throughest througher thine this thises they thee the then thence thenest thener them themselves these therer there thereby therest thereafter therein thereupon therefore their theirs thing things three two o oh owt owning owned own owns others other otherwise otherwisest otherwiser of often oftener oftenest off offs offest one ought oughts our ours ourselves ourself out outest outed outwith outs outside over overallest overaller overalls overall overs or orer orest on oneself onest ons onto a atween at athwart atop afore afterward afterwards after afterest afterer ain an any anything anybody anyone anyhow anywhere anent anear and andor another around ares are aest aer against again accordingly abaft abafter abaftest abovest above abover abouter aboutest about aid amidst amid among amongst apartest aparter apart appeared appears appear appearing appropriating appropriate appropriatest appropriates appropriater appropriated already always also along alongside although almost all allest aller allyou alls albeit awfully as aside asides aslant ases astrider astride astridest astraddlest astraddler astraddle availablest availabler available aughts aught vs v variousest variouser various via vis-a-vis vis-a-viser vis-a-visest viz very veriest verier versus k g go gone good got gotta gotten get gets getting b by byandby by-and-by bist both but buts be beyond because became becomes become becoming becomings becominger becomingest behind behinds before beforehand beforehandest beforehander bettered betters better bettering betwixt between beneath been below besides beside m my myself mucher muchest much must musts musths musth main make mayest many mauger maugre me meanwhiles meanwhile mostly most moreover more might mights midst midsts h huh humph he hers herself her hereby herein hereafters hereafter hereupon hence hadst had having haves have has hast hardly hae hath him himself hither hitherest hitherer his how-do-you-do however how howbeit howdoyoudo hoos hoo w woulded woulding would woulds was wast we wert were with withal without within why what whatever whateverer whateverest whatsoeverer whatsoeverest whatsoever whence whencesoever whenever whensoever when whenas whether wheen whereto whereupon wherever whereon whereof where whereby wherewithal wherewith whereinto wherein whereafter whereas wheresoever wherefrom which whichever whichsoever whilst while whiles whithersoever whither whoever whosoever whoso whose whomever s syne syn shalling shall shalled shalls shoulding should shoulded shoulds she sayyid sayid said saider saidest same samest sames samer saved sans sanses sanserifs sanserif so soer soest sobeit someone somebody somehow some somewhere somewhat something sometimest sometimes sometimer sometime several severaler severalest serious seriousest seriouser senza send sent seem seems seemed seemingest seeminger seemings seven summat sups sup supping supped such since sine sines sith six stop stopped p plaintiff plenty plenties please pleased pleases per perhaps particulars particularly particular particularest particularer pro providing provides provided provide probably l layabout layabouts latter latterest latterer latterly latters lots lotting lotted lot lest less ie ifs if i info information itself its it is idem idemer idemest immediate immediately immediatest immediater in inwards inwardest inwarder inward inwardest inwarder inwards inasmuch into instead insofar indicates indicated indicate indicating indeed inc f fact facts fs figupon figupons figuponing figuponed few fewer fewest frae from failing failings five furthers furtherer furthered furtherest further furthering furthermore fourscore followthrough for forwhy fornenst formerly former formerer formerest formers forbye forby fore forever forer fores four d ddays dday do doing doings doe does doth downwarder downwardest downward downwards downs done doner dones donest dos dost did differentest differenter different describing describe describes described despiting despites despited despite during c cum circa chez cer certain certainest certainer cest canst cannot cant cants canting cantest canted co could couldst comeon comeons come-ons come-on concerning concerninger concerningest consequently considering e eg eight either even evens evenser evensest evened evenest ever everyone everything everybody everywhere every ere each et etc else elsewhere else ex excepted excepts except excepting exes enough
""".split()) # downloaded from kaggle

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in custom_stopwords and len(token) >= 3]
    return filtered_tokens

reviews["final_tokens"] = reviews["lemmatized_tokens"].apply(remove_stopwords)
reviews.head()

Unnamed: 0,review,lemmatized_tokens,final_tokens
0,one of the other reviewers has mentioned that ...,"[reviewer, mention, watch, , oz, episode, ll,...","[reviewer, mention, watch, episode, hook, righ..."
1,a wonderful little production the filming te...,"[wonderful, little, production, , film, tech...","[wonderful, little, production, film, techniqu..."
2,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ...","[think, wonderful, way, spend, time, hot, summ..."
3,basically theres a family where a little boy j...,"[basically, s, family, little, boy, jake, thin...","[basically, family, little, boy, jake, think, ..."
4,petter matteis love in the time of money is a ...,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."


In [8]:
final_tokens = reviews.drop(columns = ["review", "lemmatized_tokens"])
final_tokens.head()


Unnamed: 0,final_tokens
0,"[reviewer, mention, watch, episode, hook, righ..."
1,"[wonderful, little, production, film, techniqu..."
2,"[think, wonderful, way, spend, time, hot, summ..."
3,"[basically, family, little, boy, jake, think, ..."
4,"[petter, matteis, love, time, money, visually,..."


In [9]:
sentiments = sentiments.replace({"positive": 1, "negative": 0}).astype(int)
sentiments

  sentiments = sentiments.replace({"positive": 1, "negative": 0}).astype(int)


Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [11]:
sentiments.to_pickle("data_2/sentiments.pkl")
final_tokens.to_pickle("data_2/final_tokens.pkl")

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(final_tokens, sentiments, test_size=0.30, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.33333, random_state=42)


In [13]:
X_train["final_reviews"] = X_train["final_tokens"].apply(lambda x: " ".join(x))
X_test["final_reviews"] = X_test["final_tokens"].apply(lambda x: " ".join(x))
X_val["final_reviews"] = X_val["final_tokens"].apply(lambda x: " ".join(x))
X_train

Unnamed: 0,final_tokens,final_reviews
38094,"[love, train, stomach, movie, premise, steal, ...",love train stomach movie premise steal locomot...
40624,"[ppv, like, wrestlemania, later, wwe, cram, ma...",ppv like wrestlemania later wwe cram match mat...
49425,"[find, right, word, everybodys, problem, vaude...",find right word everybodys problem vaudevillet...
35734,"[suprise, movie, high, rating, imdb, movie, ea...",suprise movie high rating imdb movie easily ro...
41708,"[ill, start, confess, tend, enjoy, action, mov...",ill start confess tend enjoy action movie mili...
...,...,...
11284,"[shadow, magic, recapture, joy, amazement, mov...",shadow magic recapture joy amazement movie aud...
44732,"[find, movie, enjoyable, fairly, entertain, ch...",find movie enjoyable fairly entertain characte...
38158,"[avoid, terrible, movie, exciting, pointless, ...",avoid terrible movie exciting pointless murder...
860,"[production, surprise, absolutely, love, obscu...",production surprise absolutely love obscure ea...


In [14]:
X_train_tokenized = X_train.drop("final_reviews", axis=1)
X_train = X_train.drop("final_tokens", axis=1)
X_test_tokenized = X_test.drop("final_reviews", axis=1)
X_test = X_test.drop("final_tokens", axis=1)
X_val_tokenized = X_val.drop("final_reviews", axis=1)
X_val = X_val.drop("final_tokens", axis=1)
X_val

Unnamed: 0,final_reviews
45040,big national lampoon hit remain animal house r...
20713,great filmmaker piece garbage minute sheer ted...
9836,intrigue nasty boss character actual bosss dau...
44734,problem child goofiest movie bad people disagr...
40160,shame movie cinematography plot support work s...
...,...
31387,film last minute maximum interesting conceptth...
35406,fine musical timeless worth see time delicious...
3375,matter hurt movie beenmaybe mislead countless ...
30489,like movie see imply publish review actually m...


In [16]:
X_train.to_pickle("data_2/X_train.pkl")
X_test.to_pickle("data_2/X_test.pkl")
X_val.to_pickle("data_2/X_val.pkl")
X_train_tokenized.to_pickle("data_2/X_train_tokenized.pkl")
X_test_tokenized.to_pickle("data_2/X_test_tokenized.pkl")
X_val_tokenized.to_pickle("data_2/X_val_tokenized.pkl")
y_train.to_pickle("data_2/y_train.pkl")
y_test.to_pickle("data_2/y_test.pkl")
y_val.to_pickle("data_2/y_val.pkl")





In [17]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_vectorized_onegram = vectorizer.fit_transform(X_train["final_reviews"])
X_test_vectorized_onegram = vectorizer.transform(X_test["final_reviews"])
X_val_vectorized_onegram = vectorizer.transform(X_val["final_reviews"])

In [20]:
svm_model = SVC()
knn_model = KNeighborsClassifier()
logreg_model = LogisticRegression(max_iter=3000)

In [None]:
svm_model.fit(X_train_vectorized_onegram, y_train)
knn_model.fit(X_train_vectorized_onegram, y_train)
logreg_model.fit(X_train_vectorized_onegram, y_train)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


In [22]:
y_val_pred_svm = svm_model.predict(X_val_vectorized_onegram)
y_val_pred_knn = knn_model.predict(X_val_vectorized_onegram)
y_val_pred_logreg = logreg_model.predict(X_val_vectorized_onegram)

val_acc_svm = accuracy_score(y_val, y_val_pred_svm)
val_acc_knn = accuracy_score(y_val, y_val_pred_knn)
val_acc_logreg = accuracy_score(y_val, y_val_pred_logreg)

print(f"Validation Accuracy - SVM: {val_acc_svm:.4f}")
print(f"Validation Accuracy - KNN: {val_acc_knn:.4f}")
print(f"Validation Accuracy - Logistic Regression: {val_acc_logreg:.4f}")

Validation Accuracy - SVM: 0.8782
Validation Accuracy - KNN: 0.5686
Validation Accuracy - Logistic Regression: 0.8876


In [25]:
with open("models_2/count_vectorizer_one_two/svm_model_one_two_gram.pkl", "wb") as f:
    pickle.dump(svm_model, f)

with open("models_2/count_vectorizer_one_two/logistic_model_one_two_gram.pkl", "wb") as f:
    pickle.dump(logreg_model, f)

with open("models_2/count_vectorizer_one_two/knn_model_one_two_gram.pkl", "wb") as f:
    pickle.dump(knn_model, f)

with open("models_2/count_vectorizer_one_two/count_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [26]:
tfidf_vectorizer = TfidfVectorizer(  
    stop_words=None,    
    ngram_range=(1, 1)  
)

X_train_tfidf_vectorized_onegram = tfidf_vectorizer.fit_transform(X_train["final_reviews"])

X_test_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_test["final_reviews"])
X_val_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_val["final_reviews"])

X_train_tfidf_vectorized_onegram.shape

(35000, 114290)

In [28]:
svm_model = SVC()
knn_model = KNeighborsClassifier()
logreg_model = LogisticRegression(max_iter=6000)

In [29]:
svm_model.fit(X_train_tfidf_vectorized_onegram, y_train)
knn_model.fit(X_train_tfidf_vectorized_onegram, y_train)
logreg_model.fit(X_train_tfidf_vectorized_onegram, y_train)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


In [30]:
y_val_pred_svm = svm_model.predict(X_val_tfid_vectorized_onegram)
y_val_pred_knn = knn_model.predict(X_val_tfid_vectorized_onegram)
y_val_pred_logreg = logreg_model.predict(X_val_tfid_vectorized_onegram)

val_acc_svm = accuracy_score(y_val, y_val_pred_svm)
val_acc_knn = accuracy_score(y_val, y_val_pred_knn)
val_acc_logreg = accuracy_score(y_val, y_val_pred_logreg)

print(f"Validation Accuracy - SVM: {val_acc_svm:.4f}")
print(f"Validation Accuracy - KNN: {val_acc_knn:.4f}")
print(f"Validation Accuracy - Logistic Regression: {val_acc_logreg:.4f}")

Validation Accuracy - SVM: 0.8912
Validation Accuracy - KNN: 0.7794
Validation Accuracy - Logistic Regression: 0.8898


In [35]:
tfidf_vectorizer = TfidfVectorizer(  
    stop_words=None,    
    ngram_range=(1, 1)  
)

X_train_tfidf_vectorized_onegram_rf = tfidf_vectorizer.fit_transform(X_train["final_reviews"])

X_test_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_test["final_reviews"])
X_val_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_val["final_reviews"])

X_train_tfidf_vectorized_onegram_rf.shape

(35000, 114290)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=200)
rf.fit(X_train_tfidf_vectorized_onegram_rf, y_train)
y_pred_train = rf.predict(X_train_tfidf_vectorized_onegram)
y_pred_val = rf.predict(X_val_tfid_vectorized_onegram)

print("Accuracy train:", accuracy_score(y_train, y_pred_train))
print("Classification Report train:\n", classification_report(y_train, y_pred_train))

print("Accuracy val:", accuracy_score(y_val, y_pred_val))
print("Classification Report val:\n", classification_report(y_val, y_pred_val))

  return fit_method(estimator, *args, **kwargs)


Accuracy train: 0.9998285714285714
Classification Report train:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17589
           1       1.00      1.00      1.00     17411

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

Accuracy val: 0.8548
Classification Report val:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      2493
           1       0.85      0.86      0.86      2507

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



In [38]:
best_k = 1
best_acc = 0
best_knn_one_grams_model = None

tfidf_vectorizer = TfidfVectorizer(  
    stop_words=None,    
    ngram_range=(1, 1)  
)

X_train_tfidf_vectorized_onegram = tfidf_vectorizer.fit_transform(X_train["final_reviews"])

X_test_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_test["final_reviews"])
X_val_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_val["final_reviews"])

for k in range(1, 21):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_tfidf_vectorized_onegram, y_train)
    y_pred = model.predict(X_val_tfid_vectorized_onegram)
    acc = accuracy_score(y_val, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
        best_knn_one_grams_model = model
    print(f"k = {k}, accuracy = {acc}")



  return self._fit(X, y)


k = 1, accuracy = 0.7368


  return self._fit(X, y)


k = 2, accuracy = 0.744


  return self._fit(X, y)


k = 3, accuracy = 0.7666


  return self._fit(X, y)


k = 4, accuracy = 0.769


  return self._fit(X, y)


k = 5, accuracy = 0.7794


  return self._fit(X, y)


k = 6, accuracy = 0.7788


  return self._fit(X, y)


k = 7, accuracy = 0.7848


  return self._fit(X, y)


k = 8, accuracy = 0.7872


  return self._fit(X, y)


k = 9, accuracy = 0.7854


  return self._fit(X, y)


k = 10, accuracy = 0.7832


  return self._fit(X, y)


k = 11, accuracy = 0.786


  return self._fit(X, y)


k = 12, accuracy = 0.787


  return self._fit(X, y)


k = 13, accuracy = 0.7882


  return self._fit(X, y)


k = 14, accuracy = 0.7934


  return self._fit(X, y)


k = 15, accuracy = 0.7922


  return self._fit(X, y)


k = 16, accuracy = 0.7898


  return self._fit(X, y)


k = 17, accuracy = 0.7924


  return self._fit(X, y)


k = 18, accuracy = 0.7938


  return self._fit(X, y)


k = 19, accuracy = 0.7954


  return self._fit(X, y)


k = 20, accuracy = 0.7972


In [39]:
for k in range(21, 41):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_tfidf_vectorized_onegram, y_train)
    y_pred = model.predict(X_val_tfid_vectorized_onegram)
    acc = accuracy_score(y_val, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
        best_knn_one_grams_model = model
    print(f"k = {k}, accuracy = {acc}")


print(f"best k = {best_k}, best accuracy = {best_acc}")

  return self._fit(X, y)


k = 21, accuracy = 0.793


  return self._fit(X, y)


k = 22, accuracy = 0.7972


  return self._fit(X, y)


k = 23, accuracy = 0.7936


  return self._fit(X, y)


k = 24, accuracy = 0.7946


  return self._fit(X, y)


k = 25, accuracy = 0.7902


  return self._fit(X, y)


k = 26, accuracy = 0.7918


  return self._fit(X, y)


k = 27, accuracy = 0.793


  return self._fit(X, y)


k = 28, accuracy = 0.7934


  return self._fit(X, y)


k = 29, accuracy = 0.789


  return self._fit(X, y)


k = 30, accuracy = 0.797


  return self._fit(X, y)


k = 31, accuracy = 0.7924


  return self._fit(X, y)


k = 32, accuracy = 0.7958


  return self._fit(X, y)


k = 33, accuracy = 0.7948


  return self._fit(X, y)


k = 34, accuracy = 0.7948


  return self._fit(X, y)


k = 35, accuracy = 0.7934


  return self._fit(X, y)


k = 36, accuracy = 0.7938


  return self._fit(X, y)


k = 37, accuracy = 0.7932


  return self._fit(X, y)


k = 38, accuracy = 0.7922


  return self._fit(X, y)


k = 39, accuracy = 0.793


  return self._fit(X, y)


k = 40, accuracy = 0.7904
best k = 20, best accuracy = 0.7972


In [40]:
for k in range(41, 101):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_tfidf_vectorized_onegram, y_train)
    y_pred = model.predict(X_val_tfid_vectorized_onegram)
    acc = accuracy_score(y_val, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
        best_knn_one_grams_model = model
    print(f"k = {k}, accuracy = {acc}")


print(f"best k = {best_k}, best accuracy = {best_acc}")

  return self._fit(X, y)


k = 41, accuracy = 0.7934


  return self._fit(X, y)


k = 42, accuracy = 0.7912


  return self._fit(X, y)


k = 43, accuracy = 0.794


  return self._fit(X, y)


k = 44, accuracy = 0.794


  return self._fit(X, y)


k = 45, accuracy = 0.7958


  return self._fit(X, y)


k = 46, accuracy = 0.7932


  return self._fit(X, y)


k = 47, accuracy = 0.7958


  return self._fit(X, y)


k = 48, accuracy = 0.795


  return self._fit(X, y)


k = 49, accuracy = 0.7962


  return self._fit(X, y)


k = 50, accuracy = 0.7958


  return self._fit(X, y)


k = 51, accuracy = 0.798


  return self._fit(X, y)


k = 52, accuracy = 0.7948


  return self._fit(X, y)


k = 53, accuracy = 0.7984


  return self._fit(X, y)


k = 54, accuracy = 0.7958


  return self._fit(X, y)


k = 55, accuracy = 0.798


  return self._fit(X, y)


k = 56, accuracy = 0.7942


  return self._fit(X, y)


k = 57, accuracy = 0.7982


  return self._fit(X, y)


k = 58, accuracy = 0.7946


  return self._fit(X, y)


k = 59, accuracy = 0.7954


  return self._fit(X, y)


k = 60, accuracy = 0.7928


  return self._fit(X, y)


k = 61, accuracy = 0.793


  return self._fit(X, y)


k = 62, accuracy = 0.7938


  return self._fit(X, y)


k = 63, accuracy = 0.793


  return self._fit(X, y)


k = 64, accuracy = 0.7938


  return self._fit(X, y)


k = 65, accuracy = 0.7928


  return self._fit(X, y)


k = 66, accuracy = 0.7956


  return self._fit(X, y)


k = 67, accuracy = 0.795


  return self._fit(X, y)


k = 68, accuracy = 0.7968


  return self._fit(X, y)


k = 69, accuracy = 0.7978


  return self._fit(X, y)


k = 70, accuracy = 0.795


  return self._fit(X, y)


k = 71, accuracy = 0.7984


  return self._fit(X, y)


k = 72, accuracy = 0.7978


  return self._fit(X, y)


k = 73, accuracy = 0.7984


  return self._fit(X, y)


k = 74, accuracy = 0.7992


  return self._fit(X, y)


k = 75, accuracy = 0.8


  return self._fit(X, y)


k = 76, accuracy = 0.8002


  return self._fit(X, y)


k = 77, accuracy = 0.7982


  return self._fit(X, y)


k = 78, accuracy = 0.7994


  return self._fit(X, y)


k = 79, accuracy = 0.7982


  return self._fit(X, y)


k = 80, accuracy = 0.7996


  return self._fit(X, y)


k = 81, accuracy = 0.7994


  return self._fit(X, y)


k = 82, accuracy = 0.7994


  return self._fit(X, y)


k = 83, accuracy = 0.7986


  return self._fit(X, y)


k = 84, accuracy = 0.7994


  return self._fit(X, y)


k = 85, accuracy = 0.7978


  return self._fit(X, y)


k = 86, accuracy = 0.7986


  return self._fit(X, y)


k = 87, accuracy = 0.798


  return self._fit(X, y)


k = 88, accuracy = 0.8012


  return self._fit(X, y)


k = 89, accuracy = 0.7994


  return self._fit(X, y)


k = 90, accuracy = 0.8032


  return self._fit(X, y)


k = 91, accuracy = 0.8026


  return self._fit(X, y)


k = 92, accuracy = 0.8016


  return self._fit(X, y)


k = 93, accuracy = 0.8002


  return self._fit(X, y)


k = 94, accuracy = 0.8012


  return self._fit(X, y)


k = 95, accuracy = 0.7994


  return self._fit(X, y)


k = 96, accuracy = 0.8016


  return self._fit(X, y)


k = 97, accuracy = 0.801


  return self._fit(X, y)


k = 98, accuracy = 0.8032


  return self._fit(X, y)


k = 99, accuracy = 0.8024


  return self._fit(X, y)


k = 100, accuracy = 0.8026
best k = 90, best accuracy = 0.8032


In [41]:
for k in range(101, 120, 2):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_tfidf_vectorized_onegram, y_train)
    y_pred = model.predict(X_val_tfid_vectorized_onegram)
    acc = accuracy_score(y_val, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
        best_knn_one_grams_model = model
    print(f"k = {k}, accuracy = {acc}")


print(f"best k = {best_k}, best accuracy = {best_acc}")

  return self._fit(X, y)


k = 101, accuracy = 0.8004


  return self._fit(X, y)


k = 103, accuracy = 0.8018


  return self._fit(X, y)


k = 105, accuracy = 0.8018


  return self._fit(X, y)


k = 107, accuracy = 0.8036


  return self._fit(X, y)


k = 109, accuracy = 0.8028


  return self._fit(X, y)


k = 111, accuracy = 0.8018


  return self._fit(X, y)


k = 113, accuracy = 0.8014


  return self._fit(X, y)


k = 115, accuracy = 0.8008


  return self._fit(X, y)


k = 117, accuracy = 0.8004


  return self._fit(X, y)


k = 119, accuracy = 0.8008
best k = 107, best accuracy = 0.8036


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
# read data form pickle
X_train = pd.read_pickle('data_2/X_train.pkl')
X_test = pd.read_pickle('data_2/X_test.pkl')
y_train = pd.read_pickle('data_2/y_train.pkl')
y_test = pd.read_pickle('data_2/y_test.pkl')
X_val = pd.read_pickle('data_2/X_val.pkl')
y_val = pd.read_pickle('data_2/y_val.pkl')

tfidf_vectorizer = TfidfVectorizer(  
    stop_words=None,    
    ngram_range=(1, 1)  
)

X_train_tfidf_vectorized_onegram = tfidf_vectorizer.fit_transform(X_train["final_reviews"])

X_test_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_test["final_reviews"])
X_val_tfid_vectorized_onegram = tfidf_vectorizer.transform(X_val["final_reviews"])

X_train_tfidf_vectorized_onegram.shape

(35000, 114290)

In [13]:
svm_model = SVC()
logreg_model = LogisticRegression(max_iter=6000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=200)

svm_model.fit(X_train_tfidf_vectorized_onegram, y_train)
logreg_model.fit(X_train_tfidf_vectorized_onegram, y_train)
rf_model.fit(X_train_tfidf_vectorized_onegram, y_train)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


In [14]:
val_pred_svm = svm_model.predict(X_val_tfid_vectorized_onegram)
val_pred_logreg = logreg_model.predict(X_val_tfid_vectorized_onegram)
val_pred_rf = rf_model.predict(X_val_tfid_vectorized_onegram)

stacked_val_features = np.column_stack((val_pred_svm, val_pred_logreg, val_pred_rf))

In [15]:
train_pred_svm = svm_model.predict(X_train_tfidf_vectorized_onegram)
train_pred_logreg = logreg_model.predict(X_train_tfidf_vectorized_onegram)
train_pred_rf = rf_model.predict(X_train_tfidf_vectorized_onegram)

stacked_train_features = np.column_stack((train_pred_svm, train_pred_logreg, train_pred_rf))

meta_model = LogisticRegression(max_iter=5000)
meta_model.fit(stacked_train_features, y_train)

  y = column_or_1d(y, warn=True)


In [16]:
y_val_pred_stacking = meta_model.predict(stacked_val_features)

from sklearn.metrics import accuracy_score, classification_report
val_acc_stacking = accuracy_score(y_val, y_val_pred_stacking)

print(f"Validation Accuracy - Stacking Classifier: {val_acc_stacking:.4f}")
print("Classification Report:\n", classification_report(y_val, y_val_pred_stacking))


Validation Accuracy - Stacking Classifier: 0.8548
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      2493
           1       0.85      0.86      0.86      2507

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



In [18]:
test_pred_svm = svm_model.predict(X_test_tfid_vectorized_onegram)
test_pred_logreg = logreg_model.predict(X_test_tfid_vectorized_onegram)
test_pred_rf = rf_model.predict(X_test_tfid_vectorized_onegram)

stacked_test_features = np.column_stack((test_pred_svm, test_pred_logreg, test_pred_rf))



In [19]:
y_test_pred_stacking = meta_model.predict(stacked_test_features)

from sklearn.metrics import accuracy_score, classification_report
test_acc_stacking = accuracy_score(y_test, y_test_pred_stacking)

print(f"testidation Accuracy - Stacking Classifier: {test_acc_stacking:.4f}")
print("Classification Report:\n", classification_report(y_test, y_test_pred_stacking))

testidation Accuracy - Stacking Classifier: 0.8488
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      4918
           1       0.85      0.85      0.85      5082

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

