In [1]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, tree
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [2]:
np.random.seed(500)

In [3]:
imdb_data = pd.read_csv(r"C:\Users\stezo\Desktop\Text_classification\IMDB Dataset.csv",encoding='latin-1')

In [None]:
"""# Usunięcie pustych rekordów.
imdb_data['text'].dropna(inplace=True)

# Zmiana wszystkich liter na małe.
imdb_data['text'] = [entry.lower() for entry in imdb_data['text']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
imdb_data['text']= [word_tokenize(entry) for entry in imdb_data['text']]

# Usunięcie słów stopu, znaków niealfanumerycznych, lematyzacja tekstu.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(imdb_data['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    imdb_data.loc[index,'text_final'] = str(Final_words)
"""

In [4]:
def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = str(text).translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>2))]) 
    
    return text2.lower()

In [5]:
imdb_data['text'].dropna(axis = 0, how ='any',inplace=True) 

imdb_data['text'] = imdb_data['text'].apply(clean_text)

In [6]:
imdb_data

Unnamed: 0,text,label
0,one the other reviewers has mentioned that aft...,positive
1,wonderful little production the filming techni...,positive
2,thought this was wonderful way spend time too ...,positive
3,basically theres family where little boy jake ...,negative
4,petter matteis love the time money visually st...,positive
...,...,...
49995,thought this movie did down right good job was...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary schools n...,negative
49998,going have disagree with the previous comment ...,negative


In [118]:
# Podział na zbiór trenningowy oraz testowy.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(imdb_data['text'],
                                                                    imdb_data['label'],
                                                                    train_size=0.9,
                                                                    test_size=0.1)

# Kodowanie etykiety
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

# Wektoryzacja tekstu
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(imdb_data['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [105]:
print(list(Tfidf_vect.vocabulary_.items())[:5])

[('one', 3057), ('the', 4405), ('other', 3086), ('reviewers', 3643), ('has', 2000)]


In [106]:
print("Size of the training set: ", Train_X_Tfidf.shape[0])
print("Size of the training set: ", Test_X_Tfidf.shape[0])

Size of the training set:  25000
Size of the training set:  15000


In [59]:
# Klasyfikator liniowy
Linear = RidgeClassifier(alpha=1, 
                         fit_intercept=True, 
                         normalize='deprecated', 
                         copy_X=True, 
                         max_iter=None, 
                         tol=0.001, 
                         class_weight=None, 
                         solver='auto', 
                         random_state=None)

%time Linear.fit(Train_X_Tfidf,Train_Y)

predictions_Ridge = Linear.predict(Test_X_Tfidf)

print("Ridge Classifier Accuracy Score -> ",accuracy_score(predictions_Ridge, Test_Y))

Wall time: 243 ms
Ridge Classifier Accuracy Score ->  0.8862


In [60]:
# Niwny klasyfikator Bayesowski
Naive = naive_bayes.MultinomialNB(alpha=10, 
                                  fit_prior=True, 
                                  class_prior=None)

%time Naive.fit(Train_X_Tfidf,Train_Y)

predictions_NB = Naive.predict(Test_X_Tfidf)

print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y))

Wall time: 26.9 ms
Naive Bayes Accuracy Score ->  0.8442


In [61]:
# Maszyna wektorów nośnych
SVM = svm.SVC(C=0.5, 
              kernel='linear', 
              degree=3, 
              gamma='auto', 
              coef0=0.0, 
              shrinking=True, 
              probability=False, 
              tol=0.001, 
              cache_size=200, 
              class_weight=None, 
              verbose=False, 
              max_iter=- 1, 
              decision_function_shape='ovr', 
              break_ties=False, 
              random_state=None)

%time SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)

print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y))

Wall time: 14min 13s
SVM Accuracy Score ->  0.8866


In [119]:
# Drzewa decyzyjne
Tree = tree.DecisionTreeClassifier(criterion='entropy', 
                                   splitter='best',
                                   max_features='auto',
                                   max_depth=None, 
                                   min_samples_split=2, 
                                   min_samples_leaf=1, 
                                   min_weight_fraction_leaf=0.0,  
                                   random_state=None, 
                                   max_leaf_nodes=None, 
                                   min_impurity_decrease=0.0, 
                                   class_weight=None, 
                                   ccp_alpha=0.0)

%time Tree.fit(Train_X_Tfidf,Train_Y)

predictions_Tree = Tree.predict(Test_X_Tfidf)

print("Decision Tree Accuracy Score -> ",accuracy_score(predictions_Tree, Test_Y))

Wall time: 1.81 s
Decision Tree Accuracy Score ->  0.6734


In [120]:
# Losowy Las
Forest = RandomForestClassifier(n_estimators=1000,
                                criterion='entropy',
                                max_features='auto',
                                n_jobs=-1,
                                random_state=None,
                                verbose=1)

%time Forest.fit(Train_X_Tfidf,Train_Y)

predictions_Forest = Forest.predict(Test_X_Tfidf)

print("Random Forest Accuracy Score -> ",accuracy_score(predictions_Forest, Test_Y))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.6min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s


Wall time: 3min 34s


[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.4s


Random Forest Accuracy Score ->  0.8526


[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.5s finished
