In [23]:
import ast

import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Read the untouched dataset from disk into the working frame `reviews`.
reviews = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [24]:
# Rebind `reviews` to the processed CSV (the file written by the `to_csv`
# cell further down), replacing whatever frame was loaded before.
reviews = pd.read_csv(filepath_or_buffer='IMDB_Dataset_tokenized.csv')

In [18]:
# Preview the first five rows of the frame.
reviews.head(n=5)

Unnamed: 0,review,sentiment,processed_tokens,cleaned_review,vector
0,One of the other reviewers has mentioned that ...,1,"['one', 'reviewer', 'mention', 'watch', '1', '...",one reviewer mention watch 1 oz episode hook r...,[-0.42966935 1.1056015 -2.4001198 0.388375...
1,A wonderful little production. <br /><br />The...,1,"['wonderful', 'little', 'production', 'film', ...",wonderful little production film technique una...,[-4.44204330e-01 5.64397991e-01 -1.67188191e+...
2,I thought this was a wonderful way to spend ti...,1,"['think', 'wonderful', 'way', 'spend', 'time',...",think wonderful way spend time hot summer week...,[-0.1606945 1.3970901 -1.9049683 -0.252108...
3,Basically there's a family where a little boy ...,0,"['basically', 'family', 'little', 'boy', 'jake...",basically family little boy jake think zombie ...,[-0.5657177 0.71835285 -2.0717964 -1.385140...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"['petter', 'mattei', 'love', 'time', 'money', ...",petter mattei love time money visually stunnin...,[-0.88697356 0.919063 -2.0504785 -1.20755 ...


In [4]:
# spaCy pipeline used later by get_doc_vector to embed each cleaned review.
nlp = spacy.load("en_core_web_md")

In [5]:
import nltk

# Put this environment's NLTK data directory first on the search path.
# The directory is prepended rather than assigned as the whole list so that
# NLTK's standard locations stay searchable on machines where this absolute
# path does not exist. Rebuilding the list also keeps the cell idempotent:
# re-running it never adds the directory twice.
nltk_data_path = "/opt/anaconda3/envs/machine_learning_python/nltk_data"
nltk.data.path = [nltk_data_path] + [
    existing for existing in nltk.data.path if existing != nltk_data_path
]

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
import re
import contractions

# Download required NLTK data.
# 'punkt_tab' is included alongside 'punkt': newer NLTK releases (the same
# ones that use 'averaged_perceptron_tagger_eng') load the tokenizer tables
# from 'punkt_tab', so word_tokenize would raise LookupError without it on a
# fresh environment. Older releases keep using 'punkt'.
for package in (
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'omw-1.4',  # Open Multilingual Wordnet
    'averaged_perceptron_tagger_eng',
):
    nltk.download(package)

def _english_stop_words():
    """Return the NLTK English stopword set, loading it only on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


# First letter of a Penn Treebank tag -> WordNet POS code.
# Any other tag falls back to noun ('n').
_TREEBANK_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def preprocess_text(text, use_stemming=False):
    """
    Turn one raw review into a list of normalized tokens.

    Steps, in order: lowercase, replace HTML tags with spaces, expand
    contractions, replace every non-word/non-space character with a space,
    tokenize, drop English stopwords, then either stem or lemmatize.

    Parameters
    ----------
    text : object
        The review. Converted with ``str()``; ``None`` and float NaN are
        treated as missing and give an empty list.
    use_stemming : bool, default False
        If True, apply the Porter stemmer. Otherwise lemmatize with WordNet,
        using NLTK POS tags to choose the lemma.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # 'none' / 'nan'. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Convert to lowercase and replace HTML tags such as <br /> with spaces
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', ' ', text)

    # Expand contractions before punctuation is stripped, while the
    # apostrophes are still present
    text = contractions.fix(text)

    # Replace everything that is not a word character or whitespace
    # (apostrophes included) with a space
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords; the set is cached so the corpus is not re-read for
    # every review
    stop_words = _english_stop_words()
    tokens = [token for token in tokens if token not in stop_words]

    if use_stemming:
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    # Lemmatization guided by the POS tag of each token
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, pos=_TREEBANK_TO_WORDNET.get(tag[:1], 'n'))
        for word, tag in pos_tag(tokens)
    ]



[nltk_data] Downloading package stopwords to
[nltk_data]     /opt/anaconda3/envs/machine_learning_python/nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /opt/anaconda3/envs/machine_learning_python/nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /opt/anaconda3/envs/machine_learning_python/nltk_data.
[nltk_data]     ..
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /opt/anaconda3/envs/machine_learning_python/nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /opt/anaconda3/envs/machine_learning_python/nltk_data.
[nltk_data]     ..
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] D

In [10]:
text = "I can't believe how great this movie was! The actors were running and jumping everywhere!"

# Run the same sentence through both normalization modes for comparison
stemmed_tokens = preprocess_text(text, use_stemming=True)
lemmatized_tokens = preprocess_text(text, use_stemming=False)

for label, shown in (
    ("Original:", text),
    ("\nStemmed:", stemmed_tokens),
    ("\nLemmatized:", lemmatized_tokens),
):
    print(label, shown)

Original: I can't believe how great this movie was! The actors were running and jumping everywhere!

Stemmed: ['believ', 'great', 'movi', 'actor', 'run', 'jump', 'everywher']

Lemmatized: ['believe', 'great', 'movie', 'actor', 'run', 'jumping', 'everywhere']


In [11]:
# Lemmatized token list for every review (keyword is forwarded to preprocess_text).
reviews['processed_tokens'] = reviews['review'].apply(preprocess_text, use_stemming=False)

In [15]:
# Encode the labels as integers: 'positive' -> 1, anything else -> 0.
# The dtype guard makes the cell idempotent. Without it, running the cell on
# labels that are already 0/1 (a re-run, or a frame loaded from the processed
# CSV) compares integers against 'positive' and silently sets every label to 0.
if not pd.api.types.is_numeric_dtype(reviews['sentiment']):
    reviews['sentiment'] = (reviews['sentiment'] == 'positive').astype('int64')

In [20]:
def _join_tokens(tokens):
    """Return the tokens as one space-separated string.

    A token list that has been through a CSV round trip comes back as its
    string repr (e.g. "['one', 'reviewer']"). It is parsed back into a list
    first, so the join works on tokens rather than on single characters.
    """
    if isinstance(tokens, str):
        tokens = ast.literal_eval(tokens)
    return ' '.join(tokens)


reviews['cleaned_review'] = reviews['processed_tokens'].apply(_join_tokens)

In [None]:
def get_doc_vector(text):
    """Run ``text`` through the notebook-level spaCy pipeline ``nlp`` and return the resulting ``Doc.vector``."""
    return nlp(text).vector

In [11]:
# One document vector per cleaned review.
reviews['vector'] = reviews['cleaned_review'].map(get_doc_vector)

In [45]:
# Save the processed frame; the row index is left out of the file.
# NOTE(review): CSV stores the token lists and vectors as text, so they come
# back as strings when this file is read again (see the reviews.head() output).
reviews.to_csv(path_or_buf='IMDB_Dataset_tokenized.csv', index=False)

In [17]:
# Standardize -> PCA keeping 95% of the variance -> SVC with default settings.
clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=.95),
    SVC(),
)

In [14]:
def _as_vector(value):
    """Return one cell of the 'vector' column as a 1-D numeric array.

    Freshly computed cells are already arrays and pass through unchanged.
    Cells loaded back from the CSV are the text form of an array
    ("[-0.42 1.10 ...]", possibly wrapped over several lines), so the
    brackets are stripped and the whitespace-separated numbers are parsed.
    """
    if isinstance(value, str):
        # float32 is presumed to match the dtype of the spaCy vectors - TODO confirm
        return np.array(value.strip().strip('[]').split(), dtype=np.float32)
    return np.asarray(value)


# Stack the per-review vectors into a single (n_reviews, n_dims) matrix.
vectors_array = np.vstack([_as_vector(value) for value in reviews['vector']])

In [61]:
# 80% train, then the remaining 20% is halved into validation and test.
# The intermediate 20% gets its own names (X_holdout / y_holdout) so that the
# second split never reads the variables it assigns; re-running this cell
# therefore always reproduces the same three partitions instead of halving
# the test set again.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    vectors_array, reviews['sentiment'].values, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [18]:
# Fit the scaler/PCA/SVC pipeline on the training split.
clf.fit(X_train, y_train)

In [19]:
# Accuracy on the validation split.
clf.score(X_val, y_val)

0.8356

In [20]:
# Accuracy on the training split, for comparison with the validation score.
clf.score(X_train, y_train)

0.899875

In [48]:
# Standardize -> PCA keeping 95% of the variance -> MLP with hidden layers of 100 and 50 units.
# - learning_rate='adaptive' was dropped: scikit-learn only uses that option
#   when solver='sgd', and this model uses the default solver, so it had no effect.
# - random_state pins the weight initialization and batch shuffling so the
#   scores below can be reproduced (same seed as the data split).
clf_2 = make_pipeline(
    StandardScaler(),
    PCA(.95),
    MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42),
)

In [51]:
# Fit the scaler/PCA/MLP pipeline on the training split.
clf_2.fit(X_train, y_train)

In [52]:
# Accuracy on the validation split.
clf_2.score(X_val, y_val)

0.7954

In [28]:
# Accuracy on the training split, for comparison with the validation score.
clf_2.score(X_train, y_train)

1.0

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer with scikit-learn's default settings; it is fitted
# later on the training texts only.
vectorizer = CountVectorizer()

In [7]:
# 80% train, then the remaining 20% is halved into validation and test,
# this time on the cleaned review texts.
# The intermediate 20% gets its own names (X_holdout_2 / y_holdout_2) so that
# the second split never reads the variables it assigns; re-running this cell
# always reproduces the same three partitions instead of halving the test set again.
X_train_2, X_holdout_2, y_train_2, y_holdout_2 = train_test_split(
    reviews['cleaned_review'].values,
    reviews['sentiment'].values,
    test_size=0.2,
    random_state=3317,
)
X_val_2, X_test_2, y_val_2, y_test_2 = train_test_split(
    X_holdout_2, y_holdout_2, test_size=0.5, random_state=3317
)

In [9]:
# Learn the vocabulary from the training texts only and convert them to counts.
# NOTE(review): X_train_2 is overwritten with the count matrix, so this cell
# cannot be re-run without re-running the split first.
X_train_2 = vectorizer.fit_transform(raw_documents=X_train_2)

In [10]:
# Convert validation and test texts to counts using the vocabulary fitted on the training texts.
X_val_2, X_test_2 = (
    vectorizer.transform(split_texts) for split_texts in (X_val_2, X_test_2)
)

In [11]:
from sklearn.linear_model import LogisticRegression

In [15]:
# L1-penalized logistic regression, fitted with the liblinear solver.
clf_3 = LogisticRegression(
    solver='liblinear',
    penalty='l1',
)

In [16]:
# Fit directly on the sparse count matrix. LogisticRegression accepts sparse
# input, so calling .toarray() would only materialize a dense
# (n_documents x vocabulary_size) array and waste memory for no benefit.
clf_3.fit(X_train_2, y_train_2)

In [17]:
# Accuracy on the validation split; the sparse matrix is passed as-is
# because densifying it with .toarray() is unnecessary for prediction.
clf_3.score(X_val_2, y_val_2)

0.8804