# SKLearn Example

## Imports

In [40]:
import os
import re

import sklearn
import pandas as pd
import nltk

## Load and parse data

### Raw data parsing

In [5]:
paths = {
    "train_pos": "../data/raw/aclImdb/train/pos/",
    "train_neg": "../data/raw/aclImdb/train/neg/",
    "test_pos": "../data/raw/aclImdb/test/pos/",
    "test_neg": "../data/raw/aclImdb/test/neg/"
}    

In [70]:
# reviews = pd.Series([], name="REVIEW")
# score = pd.Series([], name="SCORE")

reviews = []
score = []

for path in paths.values():
    files = os.listdir(path)
    
    for file in files:
        file_score = re.search('\_([0-9]{1,2})\.', file).group(1)
        with open(path+file) as file_buffer:
            file_data = file_buffer.readlines()
            file_data = "".join(file_data)
        reviews.append(file_data)
        score.append(file_score)

        
dataset_as_dataframe = pd.DataFrame()
dataset_as_dataframe['REVIEW'] = reviews
dataset_as_dataframe['SCORE'] = score

In [71]:
dataset_as_dataframe

Unnamed: 0,REVIEW,SCORE
0,Dark Angel is a cross between Huxley's Brave N...,10
1,A serious comedy. Ross Hunter-produced movie v...,7
2,Those childhood memories...when things were ne...,10
3,This film lingered and lingered at a small mov...,10
4,Very interesting. The big twist wasn't as big ...,7
5,I would have rated the series a perfect 10 for...,9
6,My rating refers to the first 4 Seasons of Sta...,8
7,After the initial shock of realizing the guts ...,10
8,The Matador is a strange film. Its main charac...,7
9,"Somehow, this movie manages to be invigorating...",10


In [72]:
dataset_as_dataframe.to_csv("../data/processed/acllib_data.csv", index=False)

### Loading from Processed data

In [82]:
usable_dataset = pd.read_csv("../data/processed/acllib_data.csv")
usable_dataset = usable_dataset.sample(frac=1).reset_index(drop=True)

## Pre processing Data

In [83]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-z]+")

def pre_processing(review):
    review = review.lower()
    review = tokenizer.tokenize(review)
    return " ".join(review)

In [84]:
usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(pre_processing)
usable_dataset["SCORE"] = usable_dataset["SCORE"].map(lambda x: 0 if x <5 else 1)
usable_dataset

Unnamed: 0,REVIEW,SCORE
0,this movie has a look and feel of many fresh d...,0
1,as i watched one of orson welles last contribu...,1
2,i just watched antwone fisher on bravo what an...,1
3,i couldn t believe how bad this film was and t...,0
4,the japanese cyber punk films have never reall...,1
5,four stories written by robert bloch about var...,1
6,whipped is minutes long this review is words l...,0
7,sergio martino has impressed me recently with ...,1
8,i watched this because a friend told me it was...,1
9,i was so offended by this film that i had to w...,0


## Train-test Split

In [90]:
train_dataframe, test_dataframe = sklearn.model_selection.train_test_split(usable_dataset,
                                             test_size=0.33,
                                             shuffle=True)

In [93]:
train_X = train_dataframe['REVIEW']
train_y = train_dataframe['SCORE']
test_X = test_dataframe['REVIEW']
test_y = test_dataframe['SCORE']

## Feature Extraction (TF-IDF)

In [94]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english', max_features=200)

In [96]:
vectorizer.fit(train_dataframe['REVIEW'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [97]:
vectorizer.transform(usable_dataset.loc[0,'REVIEW'])

<1x200 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>