In [2]:
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [3]:
# append all reviws to a single list.

reviews_train = []
for line in open('../Old_data/Python/Dataset/aclImdb/movie_data/full_train.txt', 'r'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('../Old_data/Python/Dataset/aclImdb/movie_data/full_test.txt', 'r'):
    reviews_test.append(line.strip())

In [4]:
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [5]:
# clear the text 
# replace special characters

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]") # creating object of special object's
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)") # creating object of special object's
 
def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews] # using re object to substitute
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews] # using re object to substitute
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [6]:
reviews_train_clean[0]

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt'

In [7]:
# Removing englist language stop words using nltk
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

no_stop_words = remove_stop_words(reviews_train_clean)


[nltk_data] Downloading package stopwords to /home/neo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
no_stop_words[0]

'bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell highs satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity isnt'

In [9]:
# Lemmatize the training data for root words
nltk.download('wordnet')
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(no_stop_words)

[nltk_data] Downloading package wordnet to /home/neo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
lemmatized_reviews[0]

'bromwell high cartoon comedy ran time program school life teacher 35 year teaching profession lead believe bromwell high satire much closer reality teacher scramble survive financially insightful student see right pathetic teacher pomp pettiness whole situation remind school knew student saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teacher student welcome bromwell high expect many adult age think bromwell high far fetched pity isnt'

In [None]:
# for sentiment analysis on this corpus we will use sklearn library
# specifically ngram_range to implement ngram sentiment analysis.


In [10]:
# We will transform each text to a vector on the basis of the frequency (count) of each word 
# that occurs in the entire text
# ngram = (1,2)
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(lemmatized_reviews)
X = ngram_vectorizer.transform(lemmatized_reviews)
X_test = ngram_vectorizer.transform(reviews_test_clean)

In [12]:
# train and val split
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.80)

In [12]:
# with logistic regression
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.8812
Accuracy for C=0.05: 0.8898
Accuracy for C=0.25: 0.8906
Accuracy for C=0.5: 0.8902
Accuracy for C=1: 0.8912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# final linear model 
final_tfidf = LogisticRegression(C=0.25)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_test)))

Final Accuracy: 0.88076


In [15]:
# with SVM
from sklearn.svm import LinearSVC

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.8912
Accuracy for C=0.05: 0.8912
Accuracy for C=0.25: 0.8912
Accuracy for C=0.5: 0.8912
Accuracy for C=1: 0.8912




In [16]:
# checking the accuracy
final = LinearSVC(C=0.01)
final.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final.predict(X_test)))

Final Accuracy: 0.88172


In [None]:
# if we change the n-gram to 1,3

In [18]:
# ngram = (1,3)
stop_words = ['in', 'of', 'at', 'a', 'the']
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words = english_stop_words)
ngram_vectorizer_custom_stop = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words = stop_words)

ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

    
final = LinearSVC(C=0.01)
final.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final.predict(X_test)))

Accuracy for C=0.001: 0.87152
Accuracy for C=0.005: 0.88224
Accuracy for C=0.01: 0.88272
Accuracy for C=0.05: 0.88208
Accuracy for C=0.1: 0.88272
Final Accuracy: 0.88916


In [19]:
# custom stop words 
ngram_vectorizer_custom_stop.fit(reviews_train_clean)
X = ngram_vectorizer_custom_stop.transform(reviews_train_clean)
X_test = ngram_vectorizer_custom_stop.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.001: 0.88448
Accuracy for C=0.005: 0.88768
Accuracy for C=0.01: 0.8888
Accuracy for C=0.05: 0.88928
Accuracy for C=0.1: 0.88864


In [20]:
# SVM with ngram = (1,3)
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

    
final = LinearSVC(C=0.01)
final.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final.predict(X_test)))

Accuracy for C=0.001: 0.89072
Accuracy for C=0.005: 0.89488
Accuracy for C=0.01: 0.89488
Accuracy for C=0.05: 0.89536
Accuracy for C=0.1: 0.89488
Final Accuracy: 0.89864


In [16]:
# using ngram = (1,4)
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

print("Result of SVM")
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

    
#final = LinearSVC(C=0.01)
#final.fit(X, target)
#print ("Final Accuracy: %s" 
#       % accuracy_score(target, final.predict(X_test)))



Result of SVM
Accuracy for C=0.001: 0.88736
Accuracy for C=0.005: 0.89312
Accuracy for C=0.01: 0.89376
Accuracy for C=0.05: 0.89328
Accuracy for C=0.1: 0.89296


In [13]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

In [None]:
print("Result of logistic")
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
#final_tfidf = LogisticRegression(C=0.25)
#final_tfidf.fit(X, target)
#print ("Final Accuracy: %s" 
#       % accuracy_score(target, final_tfidf.predict(X_test)))    

Result of logistic
Accuracy for C=0.01: 0.8872
Accuracy for C=0.05: 0.89136
Accuracy for C=0.25: 0.89392


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=0.5: 0.89392
