# SKLearn Example

## Imports

In [1]:
import os
import re

import sklearn
import sklearn.ensemble
import pandas as pd
import nltk

import matplotlib.pyplot as plt

## Load and parse data

### Raw data parsing

In [2]:
paths = {
    "train_pos": "../data/raw/aclImdb/train/pos/",
    "train_neg": "../data/raw/aclImdb/train/neg/",
    "test_pos": "../data/raw/aclImdb/test/pos/",
    "test_neg": "../data/raw/aclImdb/test/neg/"
}    

In [3]:
# reviews = pd.Series([], name="REVIEW")
# score = pd.Series([], name="SCORE")

reviews = []
score = []

for path in paths.values():
    files = os.listdir(path)
    
    for file in files:
        file_score = re.search('\_([0-9]{1,2})\.', file).group(1)
        with open(path+file) as file_buffer:
            file_data = file_buffer.readlines()
            file_data = "".join(file_data)
        reviews.append(file_data)
        score.append(file_score)

        
dataset_as_dataframe = pd.DataFrame()
dataset_as_dataframe['REVIEW'] = reviews
dataset_as_dataframe['SCORE'] = score

In [4]:
dataset_as_dataframe

Unnamed: 0,REVIEW,SCORE
0,"I just saw this episode this evening, on a rec...",8
1,"I just saw ""Of Human Bondage"" for the first ti...",8
2,"Hey now, yours truly, TheatreX, found this whi...",8
3,This movie started me on a Nick Cage kick. It ...,9
4,Just Cause takes some of the best parts of thr...,7
5,This stylistically sophisticated visual game p...,10
6,<br /><br />In anticipation of Ang Lee's new m...,10
7,this is really films outside (not in a motel r...,10
8,Deliriously romantic comedy with intertwining ...,9
9,This movie grabbed me with the incredible open...,9


In [5]:
dataset_as_dataframe.to_csv("../data/processed/acllib_data.csv", index=False)

### Loading from Processed data

In [6]:
usable_dataset = pd.read_csv("../data/processed/acllib_data.csv")
usable_dataset = usable_dataset.sample(frac=1).reset_index(drop=True)

## Pre processing Data

In [7]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-z]+")

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

def pre_processing(review):
    review = review.lower()
    review = cleanhtml(review)
    review = tokenizer.tokenize(review)
    return " ".join(review)

In [8]:
usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(pre_processing)
usable_dataset["SCORE"] = usable_dataset["SCORE"].map(lambda x: 0 if x <5 else 1)
usable_dataset

Unnamed: 0,REVIEW,SCORE
0,way back in one of the airliner pilots where i...,1
1,for the viewer who comes upon it long after it...,1
2,two funeral directors in a welsh village engli...,1
3,this movie blew me away i have only seen two e...,1
4,don t be deceived as i was by the glowing revi...,0
5,one of the great things about the best years o...,1
6,this film has little to recommend it though th...,0
7,the literary genius of vladimir navokov is bro...,1
8,the premise of the story is common enough aver...,1
9,this is a very entertaining flick considering ...,1


## Train-test Split

In [9]:
train_dataframe, test_dataframe = sklearn.model_selection.train_test_split(usable_dataset,
                                             test_size=0.33,
                                             shuffle=True)

In [10]:
train_X = train_dataframe['REVIEW']
train_y = train_dataframe['SCORE']
test_X = test_dataframe['REVIEW']
test_y = test_dataframe['SCORE']

## Feature Extraction (TF-IDF)

In [11]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    max_features=200,
    ngram_range=(1,3)
)

### Passing features to encoder

In [12]:
vectorizer.fit(train_dataframe['REVIEW'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
vectorizer.get_feature_names()

['acting',
 'action',
 'actor',
 'actors',
 'actually',
 'american',
 'audience',
 'away',
 'bad',
 'beautiful',
 'believe',
 'best',
 'better',
 'big',
 'bit',
 'black',
 'book',
 'boring',
 'budget',
 'camera',
 'cast',
 'character',
 'characters',
 'classic',
 'come',
 'comedy',
 'comes',
 'completely',
 'couple',
 'course',
 'day',
 'dead',
 'death',
 'did',
 'didn',
 'different',
 'director',
 'does',
 'doesn',
 'don',
 'dvd',
 'effects',
 'end',
 'ending',
 'enjoy',
 'especially',
 'excellent',
 'fact',
 'family',
 'fan',
 'far',
 'father',
 'feel',
 'film',
 'films',
 'friends',
 'fun',
 'funny',
 'gets',
 'girl',
 'given',
 'goes',
 'going',
 'good',
 'got',
 'great',
 'guy',
 'half',
 'hard',
 'having',
 'help',
 'high',
 'hollywood',
 'home',
 'horror',
 'house',
 'idea',
 'instead',
 'interesting',
 'isn',
 'job',
 'john',
 'just',
 'kids',
 'kind',
 'know',
 'later',
 'left',
 'let',
 'life',
 'like',
 'line',
 'little',
 'll',
 'long',
 'look',
 'looking',
 'looks',
 'lot'

### Parsing inputs

In [14]:
transformed_train_x = vectorizer.transform(train_X)
transformed_test_x = vectorizer.transform(test_X)

## Training the Classifier

In [15]:
classifier = sklearn.ensemble.RandomForestClassifier()

In [16]:
classifier.fit(X=transformed_train_x, y=train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Evaluating Classifier

In [17]:
predicted_test = classifier.predict(X=transformed_test_x)
sklearn.metrics.confusion_matrix(y_true=test_y, y_pred=predicted_test)

array([[6389, 1760],
       [2743, 5608]])

In [19]:
metrics = sklearn.metrics.precision_recall_fscore_support(y_true=test_y, y_pred=predicted_test)
pd.DataFrame(list(metrics), index=['Precision', 'Recall', "F-Score", "Support"])

Unnamed: 0,0,1
Precision,0.699628,0.761129
Recall,0.784023,0.671536
F-Score,0.739425,0.713531
Support,8149.0,8351.0


In [21]:
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true=test_y, y_score=predicted_test)
sklearn.metrics.auc(fpr, tpr)

0.7277794612052709