In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

import nltk 
import nltk.tokenize

import re

In [2]:
df = pd.read_csv('imdb.csv', encoding = "ISO-8859-1")
print(df.head())

                                                text  score
0  A very, very, very slow-moving, aimless movie ...      0
1  Not sure who was more lost - the flat characte...      0
2  Attempting artiness with black & white and cle...      0
3       Very little music or anything to speak of.        0
4  The best scene in the movie was when Gerardo i...      1


In [3]:
def count_aa_text(text):
    a_count = 0
    for element in text:
        stop_count = 1
        lower_text = element.lower()
        tokenized_text = nltk.tokenize.word_tokenize(lower_text)
        pos_tag_text = nltk.pos_tag(tokenized_text, tagset='universal')
        for x in pos_tag_text:
            token = x[0]
            if token[0]=='.': # I wonder how elipses are tokenized???
                stop_count = stop_count + 1
            tag = x[1]
            if tag=='ADJ' or tag == 'ADV':
                a_count = a_count + 1
            ratio = a_count/stop_count
        return ratio


print(df.head())

                                                text  score
0  A very, very, very slow-moving, aimless movie ...      0
1  Not sure who was more lost - the flat characte...      0
2  Attempting artiness with black & white and cle...      0
3       Very little music or anything to speak of.        0
4  The best scene in the movie was when Gerardo i...      1


In [4]:
#list of features
features= ['text']#,'a_count']
#target variable
target = 'score'

#arrays and lists NOT dataframes and series
X = df[features].values
y = df[target].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
#print(X_train)

ratio_of_pos_to_neg = sum(y_train)/len(y_train)
print(ratio_of_pos_to_neg)

0.525089605734767


In [5]:
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self
 #try to add in my function here 
    def transform(self, posts):
        return [{#'length': len(text),
                 #'num_sentences': text.count('.'),
                'aa_ratio': count_aa_text(text)}
                for text in posts]

In [6]:
text_stats_pipeline_with_clf = Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                ('classifier', LogisticRegression(solver = 'liblinear', class_weight = 'balanced'))
            ])

In [7]:
text_stats_pipeline_with_clf.fit(X_train, y_train)
print(text_stats_pipeline_with_clf.score(X_test, y_test))

0.5240641711229946


In [8]:
y_pred = text_stats_pipeline_with_clf.predict(X_test)
y_pred_proba = text_stats_pipeline_with_clf.predict_proba(X_test)

In [9]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.60      0.53      0.57       109
           1       0.44      0.51      0.47        78

   micro avg       0.52      0.52      0.52       187
   macro avg       0.52      0.52      0.52       187
weighted avg       0.54      0.52      0.53       187

