In [1]:
import re 
import csv
import json
import nltk
import sklearn
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt
pd.set_option("display.max_colwidth",-1)

def rescue_code(function):
    import inspect
    get_ipython().set_next_input("".join(inspect.getsourcelines(function)[0]))
    
def tic():
    #Homemade version of matlab tic and toc functions
    import time
    global startTime_for_tictoc
    startTime_for_tictoc = time.time()

def toc():
    import time
    if 'startTime_for_tictoc' in globals():
        print("Elapsed time is " + str(time.time() - startTime_for_tictoc) + " seconds.")
    else:
        print("Toc: start time not set")


### Read data from json file and Merge them into Pandas Data Frame


In [2]:
with open('drama_love_reviews_100.json') as json_data:
    posReviewsDrama = json.load(json_data)
    
with open('drama_hate_reviews_100.json') as json_data:
    negReviewsDrama = json.load(json_data)
    
with open('action_love_reviews_100.json') as json_data:
    posReviewsAction = json.load(json_data)
    
with open('action_hate_reviews_100.json') as json_data:
    negReviewsAction  = json.load(json_data)
    
with open('horror_hate_reviews_100.json') as json_data:
    negReviewsHorror  = json.load(json_data) 
    
with open('horror_love_reviews_100.json') as json_data:
    posReviewsHorror  = json.load(json_data) 
    
with open('romance_hate_reviews_100.json') as json_data:
    negReviewsRomance  = json.load(json_data)     
    
with open('romance_love_reviews_100.json') as json_data:
    posReviewsRomance  = json.load(json_data) 
    
with open('Sci-Fi_love_reviews_100.json') as json_data:
    posReviewsSci  = json.load(json_data) 
    
with open('Sci-Fi_hate_reviews_100.json') as json_data:
    negReviewsSci  = json.load(json_data) 

In [3]:
posReList = []
for reviewlist in posReviewsDrama + posReviewsHorror + posReviewsAction + posReviewsSci + posReviewsRomance:
    if reviewlist is not None:
        for review in reviewlist:
            posReList.append([review['review'],review['categorie']])
            
posReviews = pd.DataFrame(posReList,columns=['reviews', 'sentiment'])
posReviews = posReviews.drop_duplicates()

negReList = []
for reviewlist in  negReviewsAction + negReviewsHorror +negReviewsDrama +negReviewsRomance + negReviewsSci:
    if reviewlist is not None:
        for review in reviewlist:
            negReList.append([review['review'],review['categorie']])
            
negReviews = pd.DataFrame(negReList,columns=['reviews', 'sentiment'])
negReviews = negReviews.drop_duplicates()



In [4]:
print(posReviews.shape)
print(negReviews.shape)

(25819, 2)
(22866, 2)


In [4]:
from sklearn.model_selection import train_test_split

pos_train, pos_test = train_test_split(posReviews, test_size = 0.3)
neg_train, neg_test = train_test_split(negReviews, test_size = 0.3)

train = pos_train.append(neg_train, ignore_index=True)
test = pos_test.append(neg_test, ignore_index=True)

train = train.reset_index(drop=True)
train['id'] = train.index


test = test.reset_index(drop=True)
test['id'] = test.index

print(train.shape)
print(test.shape)

(34079, 3)
(14606, 3)


### Text Cleaning and Text Prepocessing

+ Convert the words to lower case
+ Remove punctuation / Remove Numbers (Any non-letters one)
  NB: emoji could be uesd to analysis sentiment as well in the case
+ Remove stopwords
+ Steaming words 
+ Split the words

In [5]:
#train['reviews'] = train['reviews'].apply(lambda x : x.lower())
#test['reviews'] = test['reviews'].apply(lambda x : x.lower())
#re.sub("[^a-zA-Z]",' ',train.ix[0,'reviews'])

def textPrepocess(movieReviews):
    movieReviews = movieReviews.lower()
    movieReviews = re.sub("[^a-zA-Z]",' ',movieReviews)
    movieReviews = (" ".join(movieReviews.split()))
    return movieReviews

train['reviews'] = train['reviews'].apply(textPrepocess)
test['reviews'] = test['reviews'].apply(textPrepocess)

Now that we have our training reviews tidied up, how do we convert them to some kind of numeric representation for machine learning? One common approach is called a Bag of Words. The Bag of Words model learns a vocabulary from all of the documents, then models each document by counting the number of times each word appears. For example, consider the following two sentences:

1. Titles :             no titles
2. Features Extraction: CountVectorizer()
3. Stopwords :          Not remove
4. Classifier :         Randomforest
5. N-gram:              (1,1)

In [9]:
# word| CountVectorizer|random forest 100|min_df=1
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word" ,min_df=1)
train_features = vectorizer.fit_transform(train['reviews'])
train_features = train_features.toarray()
vocabulary = vectorizer.get_feature_names()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

test_features = vectorizer.transform(test['reviews'])
test_features = test_features.toarray()

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100 = forest_100.fit(train_features, train["sentiment"])
result = forest_100.predict(test_features)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result, target_names=target_names))


Elapsed time is 936.9531109333038 seconds.
             precision    recall  f1-score   support

    postive       0.80      0.69      0.74      6860
   negative       0.75      0.85      0.80      7746

avg / total       0.78      0.77      0.77     14606



In [13]:
# word| CountVectorizer|random forest 100|min_df=5
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_mindf_5 = CountVectorizer(analyzer = "word" ,min_df=5)
train_features = vectorizer_mindf_5.fit_transform(train['reviews'])
train_features = train_features.toarray()
vocabulary = vectorizer_mindf_5.get_feature_names()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

test_features = vectorizer_mindf_5.transform(test['reviews'])
test_features = test_features.toarray()

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100 = forest_100.fit(train_features, train["sentiment"])
result_100_mindf_5 = forest_100.predict(test_features)
toc()


from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_mindf_5, target_names=target_names))


             precision    recall  f1-score   support

    postive       0.79      0.70      0.74      6860
   negative       0.76      0.83      0.79      7746

avg / total       0.77      0.77      0.77     14606



In [8]:
# word| CountVectorizer|random forest 200|min_df=1
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word" ,min_df=1)
train_features = vectorizer.fit_transform(train['reviews'])
train_features = train_features.toarray()
vocabulary = vectorizer.get_feature_names()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_200 = RandomForestClassifier(n_estimators = 200) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
# This may take a few minutes to run

forest_200 = forest_200.fit(train_features, train["sentiment"])

test_features = vectorizer.transform(test['reviews'])
test_features = test_features.toarray()

# Use the random forest to make sentiment label predictions
result_200 = forest_200.predict(test_features)

from sklearn.metrics import classification_report
target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_200, target_names=target_names))

toc()

             precision    recall  f1-score   support

    postive       0.82      0.69      0.75      6860
   negative       0.76      0.87      0.81      7746

avg / total       0.79      0.78      0.78     14606

Elapsed time is 4244.149746179581 seconds.


In [10]:
# word| TfidfVectorizer|random forest 100|min_df=1
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

vectorizer = CountVectorizer(analyzer = "word" ,min_df=1)
train_features_counts = vectorizer.fit_transform(train['reviews'])
tfidf_transformer = TfidfTransformer()
train_features_tfidf = tfidf_transformer.fit_transform(train_features_counts)
train_features_tfidf = train_features_tfidf.toarray()

vocabulary = vectorizer.get_feature_names()

test_features_counts = vectorizer.transform(test['reviews'])
test_features_tfidf = tfidf_transformer.transform(test_features_counts)
test_features_tfidf = test_features_tfidf.toarray()
tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 


# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_tf = forest_100.fit(train_features_tfidf, train["sentiment"])
# Use the random forest to make sentiment label predictions
result_100_tf = forest_100_tf.predict(test_features_tfidf)

from sklearn.metrics import classification_report
target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_tf, target_names=target_names))

toc()

             precision    recall  f1-score   support

    postive       0.80      0.70      0.75      6860
   negative       0.76      0.85      0.80      7746

avg / total       0.78      0.78      0.78     14606

Elapsed time is 981.7485661506653 seconds.


In [11]:
# word| TfidfVectorizer|random forest 200|min_df=1
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

vectorizer = CountVectorizer(analyzer = "word" ,min_df=1)
train_features_counts = vectorizer.fit_transform(train['reviews'])
tfidf_transformer = TfidfTransformer()
train_features_tfidf = tfidf_transformer.fit_transform(train_features_counts)
train_features_tfidf = train_features_tfidf.toarray()

vocabulary = vectorizer.get_feature_names()

test_features_counts = vectorizer.transform(test['reviews'])
test_features_tfidf = tfidf_transformer.transform(test_features_counts)
test_features_tfidf = test_features_tfidf.toarray()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_200 = RandomForestClassifier(n_estimators = 200) 


# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_200_tf = forest_200.fit(train_features_tfidf, train["sentiment"])
# Use the random forest to make sentiment label predictions
result_200_tf = forest_200_tf.predict(test_features_tfidf)

from sklearn.metrics import classification_report
target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_200_tf, target_names=target_names))

toc()

             precision    recall  f1-score   support

    postive       0.81      0.70      0.75      6860
   negative       0.76      0.86      0.81      7746

avg / total       0.79      0.78      0.78     14606

Elapsed time is 1851.3719260692596 seconds.


### bi-grams
Yes. That will generate many more features though: it might be important to apply some cut-off (for instance discard features such bi-grams or words that occur less than 5 times in your dataset) so as to not drown your classifier with too many noisy features.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

bigram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2), min_df=1)

train_features_bigram = bigram_vectorizer.fit_transform(train['reviews'])
train_features_bigram = train_features_bigram.toarray()


test_features_bigram = bigram_vectorizer.transform(test['reviews'])
test_features_bigram = test_features_bigram.toarray()

In [22]:
# bigram| counvectoriezer| random forest 100|min_df=1

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_bigram = forest_100.fit(train_features_bigram, train["sentiment"])

result_100_bigram = forest_100_bigram.predict(test_features_bigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_bigram, target_names=target_names))


Elapsed time is 24.718613862991333 seconds.
             precision    recall  f1-score   support

    postive       0.69      0.61      0.65      6860
   negative       0.69      0.76      0.72      7746

avg / total       0.69      0.69      0.69     14606



In [None]:
# bigram| TfidfVectorizer| random forest 100|min_df=1
from sklearn.feature_extraction.text import TfidfVectorizer

bigram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 2), min_df=1)

train_features_bigram = bigram_vectorizer.fit_transform(train['reviews'])
train_features_bigram = train_features_bigram.toarray()


test_features_bigram = bigram_vectorizer.transform(test['reviews'])
test_features_bigram = test_features_bigram.toarray()


tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_bigram = forest_100.fit(train_features_bigram, train["sentiment"])

result_100_bigram = forest_100_bigram.predict(test_features_bigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_bigram, target_names=target_names))

In [26]:
# bigram| counvectoriezer| random forest 200|min_df=1

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_200 = RandomForestClassifier(n_estimators = 200) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_200_bigram = forest_200.fit(train_features_bigram, train["sentiment"])

result_200_bigram = forest_200_bigram.predict(test_features_bigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_200_bigram, target_names=target_names))

Elapsed time is 46.86614799499512 seconds.
             precision    recall  f1-score   support

    postive       0.72      0.61      0.66      6860
   negative       0.70      0.79      0.74      7746

avg / total       0.71      0.70      0.70     14606



In [24]:
# bigram| counvectoriezer| random forest 100|min_df=5

from sklearn.feature_extraction.text import CountVectorizer

bigram_vectorizer_mindf_5 = CountVectorizer(analyzer='char', ngram_range=(2, 2), min_df=5)

train_features_bigram_mindf_5 = bigram_vectorizer_mindf_5.fit_transform(train['reviews'])
train_features_bigram_mindf_5 = train_features_bigram_mindf_5.toarray()


test_features_bigram_mindf_5 = bigram_vectorizer_mindf_5.transform(test['reviews'])
test_features_bigram_mindf_5 = test_features_bigram_mindf_5.toarray()


In [25]:
tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_bigram_mindf_5 = forest_100.fit(train_features_bigram_mindf_5, train["sentiment"])

result_100_bigram_mindf_5 = forest_100_bigram_mindf_5.predict(test_features_bigram_mindf_5)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_bigram_mindf_5, target_names=target_names))


Elapsed time is 22.968708992004395 seconds.
             precision    recall  f1-score   support

    postive       0.69      0.60      0.64      6860
   negative       0.68      0.76      0.72      7746

avg / total       0.69      0.69      0.68     14606



In [27]:
# bigram| counvectoriezer| random forest 200|min_df=5

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_200 = RandomForestClassifier(n_estimators = 200) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_200_bigram_mindf_5 = forest_200.fit(train_features_bigram_mindf_5, train["sentiment"])

result_200_bigram_mindf_5 = forest_200_bigram_mindf_5.predict(test_features_bigram_mindf_5)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_200_bigram_mindf_5, target_names=target_names))

Elapsed time is 46.12395191192627 seconds.
             precision    recall  f1-score   support

    postive       0.71      0.61      0.66      6860
   negative       0.69      0.78      0.73      7746

avg / total       0.70      0.70      0.70     14606



### Trigram ngram(2,3)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

trigram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), min_df=1)

train_features_trigram = trigram_vectorizer.fit_transform(train['reviews'])
train_features_trigram = train_features_trigram.toarray()


test_features_trigram = trigram_vectorizer.transform(test['reviews'])
test_features_trigram = test_features_trigram.toarray()

In [31]:
# Trigram| counvectoriezer| random forest 100|min_df=1

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_trigram = forest_100.fit(train_features_trigram, train["sentiment"])

result_100_trigram = forest_100_trigram.predict(test_features_trigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_trigram, target_names=target_names))

Elapsed time is 116.61578106880188 seconds.
             precision    recall  f1-score   support

    postive       0.73      0.66      0.69      6860
   negative       0.72      0.79      0.75      7746

avg / total       0.73      0.73      0.73     14606



In [33]:
# Trigram| TfidfVectorizer| random forest 100|min_df=1

from sklearn.feature_extraction.text import CountVectorizer

trigram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), min_df=1)

train_features_trigram = trigram_vectorizer.fit_transform(train['reviews'])
train_features_trigram = train_features_trigram.toarray()


test_features_trigram = trigram_vectorizer.transform(test['reviews'])
test_features_trigram = test_features_trigram.toarray()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_100 = RandomForestClassifier(n_estimators = 100) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_100_trigram = forest_100.fit(train_features_trigram, train["sentiment"])

result_100_trigram = forest_100_trigram.predict(test_features_trigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_100_trigram, target_names=target_names))

Elapsed time is 139.3318588733673 seconds.
             precision    recall  f1-score   support

    postive       0.75      0.66      0.70      6860
   negative       0.73      0.80      0.76      7746

avg / total       0.74      0.73      0.73     14606



In [42]:
# Trigram| TfidfVectorizer| random forest 100|min_df=1

from sklearn.feature_extraction.text import CountVectorizer

trigram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), min_df=1)

train_features_trigram = trigram_vectorizer.fit_transform(train['reviews'])
train_features_trigram = train_features_trigram.toarray()


test_features_trigram = trigram_vectorizer.transform(test['reviews'])
test_features_trigram = test_features_trigram.toarray()

tic()
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest_200 = RandomForestClassifier(n_estimators = 200) 

# Use the random forest to make sentiment label predictions
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_200_trigram = forest_200.fit(train_features_trigram, train["sentiment"])

result_200_trigram = forest_200_trigram.predict(test_features_trigram)
toc()

from sklearn.metrics import classification_report

target_names = ['postive', 'negative']
print(classification_report(test['sentiment'], result_200_trigram, target_names=target_names))

Elapsed time is 211.45111799240112 seconds.
             precision    recall  f1-score   support

    postive       0.75      0.66      0.70      6860
   negative       0.73      0.81      0.77      7746

avg / total       0.74      0.74      0.74     14606



In [8]:
# Trigram| TfidfVectorizer| random forest 100|min_df=1
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def bagsOfWordUnigramRF(vect_select,tree_Num,df_min):
    target_names = ['postive', 'negative']
    # 1: TfidfVectorizer
    if vect_select == 1:
        tf_vectorizer = TfidfVectorizer(analyzer='word',min_df=df_min)
        
        train_features_tf = tf_vectorizer.fit_transform(train['reviews'])
        train_features_tf = train_features_tf.toarray()


        test_features_tf = tf_vectorizer.transform(test['reviews'])
        test_features_tf = test_features_tf.toarray()
         
        print('Training..classifer...\n')
        tic()
        forest = RandomForestClassifier(n_estimators = tree_Num) 
        forest_model = forest.fit(train_features_tf, train["sentiment"])
        result = forest_model.predict(test_features_tf)
        toc()
        
        print('Feature extraction: tf-idf..','Random forest',(tree_Num),'..','Min df',df_min,'..\n')
        print(classification_report(test['sentiment'], result, target_names=target_names))
        
    elif vect_select == 2:
        
        count_vectorizer = CountVectorizer(analyzer='word',min_df=df_min)
        
        train_features_count = count_vectorizer.fit_transform(train['reviews'])
        train_features_count = train_features_count.toarray()


        test_features_count = count_vectorizer.transform(test['reviews'])
        test_features_count = test_features_count.toarray()
         
        print('Training..classifer...\n')
        tic()
        forest = RandomForestClassifier(n_estimators = tree_Num) 
        forest_model = forest.fit(train_features_count, train["sentiment"])
        result = forest_model.predict(test_features_count)
        toc()
        
        print('Feature extraction: tf-idf..','Random forest',(tree_Num),'..','Min df',df_min,'..\n')
        print(classification_report(test['sentiment'], result, target_names=target_names))


In [9]:
bagsOfWordUnigramRF(1,100,1)

Training..classifer...

Elapsed time is 959.305449962616 seconds.
Feature extraction: tf-idf.. Random forest 100 .. Min df 1 ..

             precision    recall  f1-score   support

    postive       0.79      0.70      0.74      6860
   negative       0.76      0.84      0.80      7746

avg / total       0.78      0.77      0.77     14606



In [None]:
bagsOfWordUnigramRF(1,100,3)

Training..classifer...

Elapsed time is 511.7078468799591 seconds.
Feature extraction: tf-idf.. Random forest 100 .. Min df 3 ..

             precision    recall  f1-score   support

    postive       0.80      0.70      0.75      6860
   negative       0.76      0.84      0.80      7746

avg / total       0.78      0.78      0.78     14606



In [None]:
bagsOfWordUnigramRF(1,200,1)

Training..classifer...



In [None]:
bagsOfWordUnigramRF(1,200,3)

In [None]:
# Trigram| TfidfVectorizer| random forest 100|min_df=1
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def bagsOfWordNgramRF(vect_select,tree_Num,df_min,max_gram,min_gram):
    target_names = ['postive', 'negative']
    # 1: TfidfVectorizer
    if vect_select == 1:
        tf_vectorizer = TfidfVectorizer(analyzer='word',min_df=df_min,ngram_range=(min_gram,max_gram))
        
        train_features_tf = tf_vectorizer.fit_transform(train['reviews'])
        train_features_tf = train_features_tf.toarray()


        test_features_tf = tf_vectorizer.transform(test['reviews'])
        test_features_tf = test_features_tf.toarray()
         
        print('Training..classifer...\n')
        tic()
        forest = RandomForestClassifier(n_estimators = tree_Num) 
        forest_model = forest.fit(train_features_tf, train["sentiment"])
        result = forest_model.predict(test_features_tf)
        toc()
        
        print('Feature extraction: tf-idf..','Random forest',(tree_Num),'..','Min df',df_min,'..\n')
        print(classification_report(test['sentiment'], result, target_names=target_names))
        
    elif vect_select == 2:
        
        count_vectorizer = CountVectorizer(analyzer='word',min_df=df_min,ngram_range=(min_gram,max_gram))
        
        train_features_count = count_vectorizer.fit_transform(train['reviews'])
        train_features_count = train_features_count.toarray()


        test_features_count = count_vectorizer.transform(test['reviews'])
        test_features_count = test_features_count.toarray()
         
        print('Training..classifer...\n')
        tic()
        forest = RandomForestClassifier(n_estimators = tree_Num) 
        forest_model = forest.fit(train_features_count, train["sentiment"])
        result = forest_model.predict(test_features_count)
        toc()
        
        print('Feature extraction: tf-idf..','Random forest',(tree_Num),'..','Min df',df_min,'..\n')
        print(classification_report(test['sentiment'], result, target_names=target_names))
