In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import string
import collections

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
TRAIN_PATH  = '/content/gdrive/My Drive/Colab Notebooks/X_train.csv'
SUBMISSION_PATH =  '/content/gdrive/My Drive/Colab Notebooks/X_submission.csv'
X_train_origin = pd.read_csv(TRAIN_PATH)
X_submission = pd.read_csv(SUBMISSION_PATH)

In [4]:
# to visualize some data
X_temp = X_train_origin[['Score', 'Summary']]
X_temp['Summarylength'] = X_temp['Summary'].str.len()
X_temp = X_temp.drop('Summary', axis=1)
X_temp = X_temp.groupby('Score').mean()
print(X_temp)

       Summarylength
Score               
1.0        27.866070
2.0        29.092838
3.0        29.766138
4.0        29.162521
5.0        26.190694


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
X_temp = X_train_origin[['Score', 'Text']]
X_temp['Textlength'] = X_temp['Text'].str.len()
X_temp = X_temp.drop('Text', axis=1)
X_temp = X_temp.groupby('Score').mean()
print(X_temp)

        Textlength
Score             
1.0     862.809291
2.0    1123.463720
3.0    1122.747793
4.0    1100.062099
5.0     786.696716


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
X_temp = X_train_origin[['Score', 'Time']]
X_temp['date'] = pd.to_datetime(X_temp['Time'], unit='s')
X_temp['year'] = X_temp['date'].dt.year
X_temp = X_temp.drop(columns=['Time', 'date'])
X_temp = X_temp.groupby('year').mean()
print(X_temp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


         Score
year          
1997  4.857143
1998  4.188011
1999  4.006810
2000  4.031556
2001  4.007905
2002  4.014383
2003  3.999198
2004  3.958226
2005  3.951324
2006  3.979884
2007  4.061298
2008  3.988038
2009  3.988523
2010  3.986099
2011  4.000012
2012  4.130792
2013  4.280746
2014  4.259809


In [7]:
X_temp = X_train_origin[['Score', 'Time']]
X_temp['date'] = pd.to_datetime(X_temp['Time'], unit='s')
X_temp['month'] = X_temp['date'].dt.month
X_temp = X_temp.drop(columns=['Time', 'date'])
X_temp = X_temp.groupby('month').mean()
print(X_temp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


          Score
month          
1      4.150487
2      4.131141
3      4.125995
4      4.122719
5      4.110686
6      4.102390
7      4.101642
8      4.060489
9      4.069147
10     4.074341
11     4.090945
12     4.152789


In [8]:
X_temp = X_train_origin[['Score', 'HelpfulnessNumerator']]
X_temp = X_temp.groupby('Score').mean()
print(X_temp)

       HelpfulnessNumerator
Score                      
1.0                5.833487
2.0                3.429142
3.0                2.930144
4.0                3.353811
5.0                3.567425


In [9]:
X_temp = X_train_origin[['Score', 'HelpfulnessDenominator']]
X_temp = X_temp.groupby('Score').mean()
print(X_temp)

       HelpfulnessDenominator
Score                        
1.0                 15.621457
2.0                  7.626516
3.0                  4.902460
4.0                  4.280335
5.0                  4.383712


In [64]:
# preprocess
def get_nlp_score(ws_array_normalized, words_list, s):
  temp = np.array([0.0, 0.0, 0.0, 0.0, 0.0])
  find = False # if any word is in the vocabulary
  if type(s) != float:
    s = s.lower()
    exclude = set(string.punctuation)
    s = ''.join(ch for ch in s if ch not in exclude)
    words = s.split(' ')
    for word in words:
      if word in words_list:
        find = True
        temp = temp + ws_array_normalized[words_list.index(word)]
  else:
    return [0.0, 0.0, 0.0, 0.0, 0.0]
  if not find:
    return [0.0, 0.0, 0.0, 0.0, 0.0]
  total = temp.sum()
  temp_normalized = temp / total
  return temp_normalized.tolist()

def get_avg_score(x, dic, avg_all_scores):
  """ return the avg score stored in the dictionary """
  if x in dic:
    return dic[x]
  return avg_all_scores

def is_summary_length_long(s):
  """ take in a string s and return whether it is long"""
  if type(s) == float:
    return 0
  if len(s) > 28:
    return 1
  return 0

def is_text_length_long(s):
  if type(s) == float:
    return 0
  if len(s) > 1000:
    return 1
  return 0

def is_in_year_above_four(year):
  if year in [2003,2004,2005,2006,2008,2009,2010]:
    return 0
  return 1

def is_numerator_above_3(n):
  if n > 3:
    return 1
  return 0

def is_numerator_above_5(n):
  if n > 5:
    return 1
  return 0

def is_deno_above_7(n):
  if n > 7:
    return 1
  return 0

def is_deno_above_15(n):
  if n > 15:
    return 1
  return 0

def uniques_level(x, uniques):
  for i in range(len(uniques)):
    if x in uniques[i]:
      return i
  return 0 

def preprocess(dataframe, ws_array_normalized, words_list, ws_array_normalized2, words_list2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text):
  temp = dataframe.drop(columns = ['Id', 'ProductId', 'UserId', 'Summary', 'Text', 'Score']) # drop unnecessary info

  temp['SummaryLength'] = dataframe['Summary'].str.len()
  temp['TextLength'] = dataframe['Text'].str.len()
  temp = (temp - temp.mean()) / temp.std()
  temp = temp.fillna(temp.mean())

  temp['nabove3'] = dataframe['HelpfulnessNumerator'].map(lambda x: is_numerator_above_3(x))
  temp['nabove5'] = dataframe['HelpfulnessNumerator'].map(lambda x: is_numerator_above_5(x))
  temp['dabove7'] = dataframe['HelpfulnessDenominator'].map(lambda x: is_deno_above_7(x))
  temp['dabove15'] = dataframe['HelpfulnessDenominator'].map(lambda x: is_deno_above_15(x))

  temp['year'] = pd.to_datetime(dataframe['Time'], unit='s').dt.year
  temp['isHighYear'] = temp['year'].map(lambda x: is_in_year_above_four(x))

  temp['SummaryLengthEval'] = dataframe['Summary'].map(lambda x: is_summary_length_long(x))
  temp['TextLengthEval'] = dataframe['Text'].map(lambda x: is_text_length_long(x))

  temp['film_avg'] = dataframe['ProductId'].map(lambda x: get_avg_score(x, film_dic, avg_all_scores))
  temp['user_avg'] = dataframe['UserId'].map(lambda x: get_avg_score(x, user_dic, avg_all_scores))
  temp['film_std'] = dataframe['ProductId'].map(lambda x: get_avg_score(x, film_dic_std, 0)).fillna(0)
  temp['user_std'] = dataframe['UserId'].map(lambda x: get_avg_score(x, user_dic_std, 0)).fillna(0)

  temp['unique_summary'] = dataframe['Summary'].map(lambda x: uniques_level(x, uniques_summary))
  temp['unique_text'] = dataframe['Text'].map(lambda x: uniques_level(x, uniques_text))

  temp['SummaryNLP'] = dataframe['Summary'].map(lambda x: get_nlp_score(ws_array_normalized, words_list, x))
  temp[['S1', 'S2', 'S3', 'S4', 'S5']] = pd.DataFrame(temp['SummaryNLP'].tolist(), index = temp.index)
  temp['TextNLP'] = dataframe['Summary'].map(lambda x: get_nlp_score(ws_array_normalized2, words_list2, x))
  temp[['t1', 't2', 't3', 't4', 't5']] = pd.DataFrame(temp['TextNLP'].tolist(), index = temp.index)

  temp = temp.drop(columns=['year', 'SummaryNLP', 'TextNLP'])
  temp['Score'] = dataframe['Score']
  return temp

  
def preprocess_nonnegative(dataframe, ws_array_normalized, words_list, ws_array_normalized2, words_list2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text):
  ''' use different scaling to avoid negative numbers '''
  temp = dataframe.drop(columns = ['Id', 'ProductId', 'UserId', 'Summary', 'Text', 'Score']) # drop unnecessary info

  temp['SummaryLength'] = dataframe['Summary'].str.len()
  temp['TextLength'] = dataframe['Text'].str.len()
  temp = (temp - temp.min()) / temp.max()
  temp = temp.fillna(temp.mean())

  temp['nabove3'] = dataframe['HelpfulnessNumerator'].map(lambda x: is_numerator_above_3(x))
  temp['nabove5'] = dataframe['HelpfulnessNumerator'].map(lambda x: is_numerator_above_5(x))
  temp['dabove7'] = dataframe['HelpfulnessDenominator'].map(lambda x: is_deno_above_7(x))
  temp['dabove15'] = dataframe['HelpfulnessDenominator'].map(lambda x: is_deno_above_15(x))

  temp['year'] = pd.to_datetime(dataframe['Time'], unit='s').dt.year
  temp['isHighYear'] = temp['year'].map(lambda x: is_in_year_above_four(x))

  temp['SummaryLengthEval'] = dataframe['Summary'].map(lambda x: is_summary_length_long(x))
  temp['TextLengthEval'] = dataframe['Text'].map(lambda x: is_text_length_long(x))

  temp['film_avg'] = dataframe['ProductId'].map(lambda x: get_avg_score(x, film_dic, avg_all_scores))
  temp['user_avg'] = dataframe['UserId'].map(lambda x: get_avg_score(x, user_dic, avg_all_scores))
  temp['film_std'] = dataframe['ProductId'].map(lambda x: get_avg_score(x, film_dic_std, 0)).fillna(0)
  temp['user_std'] = dataframe['UserId'].map(lambda x: get_avg_score(x, user_dic_std, 0)).fillna(0)

  temp['unique_summary'] = dataframe['Summary'].map(lambda x: uniques_level(x, uniques_summary))
  temp['unique_text'] = dataframe['Text'].map(lambda x: uniques_level(x, uniques_text))

  temp['SummaryNLP'] = dataframe['Summary'].map(lambda x: get_nlp_score(ws_array_normalized, words_list, x))
  temp[['S1', 'S2', 'S3', 'S4', 'S5']] = pd.DataFrame(temp['SummaryNLP'].tolist(), index = temp.index)
  temp['TextNLP'] = dataframe['Summary'].map(lambda x: get_nlp_score(ws_array_normalized2, words_list2, x))
  temp[['t1', 't2', 't3', 't4', 't5']] = pd.DataFrame(temp['TextNLP'].tolist(), index = temp.index)

  temp = temp.drop(columns=['year', 'SummaryNLP', 'TextNLP'])
  temp['Score'] = dataframe['Score']
  return temp

In [26]:
all_train_scores = X_train_origin['Score'].to_list()
avg_all_scores = sum(all_train_scores) / len(all_train_scores) # the avg
print(avg_all_scores)

4.1114678508486024


In [27]:
film_scores_df = X_train_origin[['ProductId', 'Score']]
film_df = film_scores_df.groupby('ProductId').mean()
film_dic = film_df.to_dict()['Score']

#print(film_dic) 
print(len(film_dic))

50050


In [54]:
film_df_std = film_scores_df.groupby('ProductId').std()
film_dic_std = film_df_std.to_dict()['Score']
print(len(film_dic_std))

50050


In [28]:
users_scores_df = X_train_origin[['UserId', 'Score']]
user_df = users_scores_df.groupby('UserId').mean()
user_dic = user_df.to_dict()['Score']

#print(user_dic)
print(len(user_dic))

123958


In [56]:
user_df_std = users_scores_df.groupby('UserId').std()
user_dic_std = user_df_std.to_dict()['Score']
print(len(user_dic_std))

123958


In [None]:
# NLP
''' all_summaries = X_train_origin['Summary'].fillna('').to_list()
vectorizer = TfidfVectorizer(stop_words='english')
words = vectorizer.fit_transform(all_summaries)
word_list = vectorizer.get_feature_names()
count_list = np.asarray(words.sum(axis=0))[0]
dic = dict(zip(word_list, count_list))
the_vocab = list(dict(sorted(dic.items(), key=lambda item: item[1], reverse=True)))[:200] # 200 most common words
print(the_vocab) '''
''' with open("/content/gdrive/My Drive/Colab Notebooks/data/summary200words.txt", 'w') as f:
  f.write(' '.join(the_vocab)) '''

In [29]:
with open("/content/gdrive/My Drive/Colab Notebooks/data/summary200words.txt", 'r') as f:
  the_vocab = f.read().split(' ')
print(the_vocab)

documents_summary = []
documents_text = []
for score in [1,2,3,4,5]:
    df_temp = X_train_origin[X_train_origin['Score'] == score]
    str_list_temp = df_temp['Summary'].to_list()
    str_list_temp2 = df_temp['Text'].to_list()
    str_list_temp = [s for s in str_list_temp if type(s) != float]
    str_list_temp2 = [s for s in str_list_temp2 if type(s) != float]
    document_summary = ' '.join(str_list_temp)
    document_text = ' '.join(str_list_temp2)
    documents_summary.append(document_summary)
    documents_text.append(document_text)

vectorizer2 = TfidfVectorizer(stop_words='english')
words2 = vectorizer2.fit_transform(the_vocab)
words_score = vectorizer2.transform(documents_summary)
ws_array = np.array(words_score.toarray().T)
#print(ws_array)

row_sums = ws_array.sum(axis=1)
ws_array_normalized = ws_array / row_sums[:, np.newaxis]
#print(ws_array_normalized)

['movie', 'great', 'good', 'love', 'best', 'film', 'classic', 'fun', 'dvd', 'excellent', 'funny', 'series', 'stars', 'better', 'story', 'bad', 'time', 'awesome', 'season', 'movies', 'like', 'just', 'wonderful', 'entertaining', 'loved', 'watch', 'favorite', 'action', 'really', 'family', 'comedy', 'worth', 'ok', 'review', 'interesting', 'don', 'amazing', 'horror', 'fantastic', 'old', 'ray', 'blu', 'tv', 'nice', 'cute', 'pretty', 'wow', 'enjoyable', 'beautiful', 'collection', 'little', 'flick', 'original', 'star', 'hilarious', 'new', 'boring', 'expected', 'life', 'true', 'set', 'man', 'perfect', 'way', 'christmas', 'okay', 'films', 'buy', 'masterpiece', 'thriller', 'real', 'drama', 'version', 'brilliant', 'disappointing', 'fan', 'seen', 'quot', 'kids', 'finally', 'war', 'book', 'acting', 'quality', 'entertainment', 'watching', 'liked', 'worst', 've', 'fans', 'long', 'big', 'greatest', 'workout', 'outstanding', 'sequel', 'western', 'say', 'fi', 'video', 'sci', 'different', 'cool', 'gift', 

In [None]:
# do the same on text
''' all_texts = X_train_origin['Text'].fillna('').to_list()
vectorizer3 = TfidfVectorizer(stop_words='english')
words3 = vectorizer3.fit_transform(all_texts)
word_list2 = vectorizer3.get_feature_names()
count_list2 = np.asarray(words3.sum(axis=0))[0]
dic2 = dict(zip(word_list2, count_list2))
the_vocab2 = list(dict(sorted(dic2.items(), key=lambda item: item[1], reverse=True)))[:200] # 200 most common words
print(the_vocab2) '''
''' with open("/content/gdrive/My Drive/Colab Notebooks/data/text200words.txt", 'w') as f:
  f.write(' '.join(the_vocab2)) '''

In [30]:
with open("/content/gdrive/My Drive/Colab Notebooks/data/text200words.txt", 'r') as f:
  the_vocab2 = f.read().split(' ')
print(the_vocab2)

vectorizer4 = TfidfVectorizer(stop_words='english')
words4 = vectorizer4.fit_transform(the_vocab2)
words_score2 = vectorizer4.transform(documents_text)
ws_array2 = np.array(words_score2.toarray().T)

row_sums2 = ws_array2.sum(axis=1)
ws_array_normalized2 = ws_array2 / row_sums2[:, np.newaxis]

['movie', 'film', 'great', 'good', 'like', 'just', 'love', 'story', 'watch', 'dvd', 'really', 'movies', 'time', 'series', 'best', '34', 'season', 'quot', 'don', 'watching', 'better', 'did', 'funny', 'think', 'seen', 'acting', 'way', 'people', 'characters', 'action', 'little', 'make', 'fun', 'bad', 'family', 'loved', 'know', 'life', 'enjoyed', 'old', 'enjoy', 'recommend', 'films', 'watched', 'worth', 'say', 'character', 'plot', 've', 'does', 'new', 'end', 'years', 'actors', 'set', 'fan', 'didn', 'excellent', 'lot', 'want', 'times', 'scenes', 'cast', 'tv', 'man', 'original', 'thought', 'buy', 'got', 'classic', 'real', 'version', 'interesting', 'saw', 'going', 'collection', 'makes', 'favorite', 'music', 'pretty', 'long', 'quality', 'shows', 'wonderful', 'kids', 'special', 'liked', 'stars', 'bit', 'book', 'episodes', 'look', 'll', 'work', 'comedy', 'world', 'ray', 'thing', 'horror', 'things', 'star', 'young', 'big', 'bought', 'entertaining', 'right', 'scene', 'ending', 'video', 'feel', 'ep

In [31]:
most_frequent_words = []
current_score = 1
for document_summary in documents_summary:
  vectorizer_test = CountVectorizer(stop_words='english')
  words_test = vectorizer_test.fit_transform(document_summary.split('.'))
  words_bag = vectorizer_test.transform(document_summary.split('.'))
  sum_words = words_bag.sum(axis=0)
  words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer_test.vocabulary_.items()]
  words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
  print("For score " + str(current_score))
  print(words_freq[:100])
  current_score += 1
  most_frequent_words.append(words_freq[:100])

For score 1
[('movie', 8988), ('bad', 4636), ('worst', 3077), ('boring', 2598), ('waste', 2527), ('time', 2495), ('film', 2467), ('don', 2334), ('good', 1973), ('terrible', 1955), ('horrible', 1917), ('awful', 1917), ('dvd', 1836), ('just', 1731), ('like', 1504), ('money', 1484), ('really', 1326), ('star', 1325), ('great', 1280), ('stars', 1094), ('stupid', 1082), ('movies', 1081), ('watch', 1036), ('poor', 956), ('seen', 853), ('garbage', 837), ('buy', 820), ('disappointing', 740), ('did', 696), ('ve', 692), ('worth', 683), ('funny', 656), ('better', 653), ('review', 643), ('ray', 639), ('version', 623), ('crap', 612), ('blu', 610), ('story', 609), ('horror', 599), ('quality', 599), ('make', 551), ('way', 550), ('love', 548), ('oh', 546), ('series', 546), ('disappointment', 534), ('lame', 515), ('zero', 512), ('didn', 509), ('big', 509), ('worse', 504), ('disappointed', 493), ('acting', 481), ('quot', 479), ('watching', 457), ('sucks', 455), ('save', 443), ('dumb', 440), ('trash', 439

In [32]:
print('1:')
score1_100_words = [w for w, count in most_frequent_words[0]]
print(score1_100_words)
print('2:')
score2_100_words = [w for w, count in most_frequent_words[1]]
print(score2_100_words)
print('3:')
score3_100_words = [w for w, count in most_frequent_words[2]]
print(score3_100_words)
print('4:')
score4_100_words = [w for w, count in most_frequent_words[3]]
print(score4_100_words)
print('5:')
score5_100_words = [w for w, count in most_frequent_words[4]]
print(score5_100_words)

1:
['movie', 'bad', 'worst', 'boring', 'waste', 'time', 'film', 'don', 'good', 'terrible', 'horrible', 'awful', 'dvd', 'just', 'like', 'money', 'really', 'star', 'great', 'stars', 'stupid', 'movies', 'watch', 'poor', 'seen', 'garbage', 'buy', 'disappointing', 'did', 've', 'worth', 'funny', 'better', 'review', 'ray', 'version', 'crap', 'blu', 'story', 'horror', 'quality', 'make', 'way', 'love', 'oh', 'series', 'disappointment', 'lame', 'zero', 'didn', 'big', 'worse', 'disappointed', 'acting', 'quot', 'watching', 'sucks', 'save', 'dumb', 'trash', 'original', 'people', 'beware', 'best', 'real', 'season', 'hollywood', 'plot', 'release', 'want', 'hate', 'say', 'bother', 'wrong', 'total', 'does', 'wow', 'book', 'complete', 'comedy', 'minutes', 'mess', 'films', 'new', 'dull', 'life', 'avoid', 'old', 'absolutely', 'video', 'doesn', 'sad', 'let', 'slow', 'got', 'away', 'pointless', 'rip', 'couldn', 'man']
2:
['movie', 'good', 'bad', 'film', 'great', 'boring', 'disappointing', 'just', 'like', 'b

In [33]:
# unique words
print('1:')
uniques1_summary = [w for w in score1_100_words if (w not in score2_100_words and w not in score3_100_words and w not in score4_100_words and w not in score5_100_words )]
print(uniques1_summary)
print('2:')
uniques2_summary = [w for w in score2_100_words if (w not in score1_100_words and w not in score3_100_words and w not in score4_100_words and w not in score5_100_words )]
print(uniques2_summary)
print('3:')
uniques3_summary = [w for w in score3_100_words if (w not in score2_100_words and w not in score1_100_words and w not in score4_100_words and w not in score5_100_words )]
print(uniques3_summary)
print('4:')
uniques4_summary = [w for w in score4_100_words if (w not in score2_100_words and w not in score3_100_words and w not in score1_100_words and w not in score5_100_words )]
print(uniques4_summary)
print('5:')
uniques5_summary = [w for w in score5_100_words if (w not in score2_100_words and w not in score3_100_words and w not in score4_100_words and w not in score1_100_words )]
print(uniques5_summary)

1:
['garbage', 'crap', 'zero', 'worse', 'sucks', 'trash', 'beware', 'hollywood', 'release', 'want', 'hate', 'bother', 'wrong', 'total', 'minutes', 'mess', 'avoid', 'absolutely', 'sad', 'got', 'away', 'pointless', 'rip', 'couldn']
2:
['weak', 'overrated', 'script', 'silly', 'remake', 'potential', 'idea', 'effects']
3:
['half', 'short', 'mixed']
4:
['solid', 'surprisingly', 'tale', 'cool', 'enjoyed', 'fine']
5:
['favorite', 'amazing', 'fantastic', 'masterpiece', 'christmas', 'brilliant', 'greatest', 'finally', 'hilarious', 'outstanding', 'years', 'superb', 'disney', 'gift', 'heart', 'edition', 'music', 'day']


In [34]:
uniques_summary = [uniques1_summary, uniques2_summary, uniques3_summary, uniques4_summary, uniques5_summary]

In [24]:
most_frequent_words_text = []
current_score = 1
for document_text in documents_text:
  vectorizer_test = CountVectorizer(stop_words='english')
  words_test = vectorizer_test.fit_transform(document_text.split('.'))
  words_bag = vectorizer_test.transform(document_text.split('.'))
  sum_words = words_bag.sum(axis=0)
  words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer_test.vocabulary_.items()]
  words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
  print("For score " + str(current_score))
  print(words_freq[:200])
  current_score += 1
  most_frequent_words_text.append(words_freq[:200])

For score 1
For score 2
[('movie', 6593), ('good', 4146), ('bad', 2845), ('film', 2602), ('great', 2404), ('boring', 2165), ('disappointing', 1813), ('just', 1756), ('like', 1716), ('better', 1702), ('don', 1415), ('story', 1334), ('dvd', 1276), ('stars', 1255), ('time', 1226), ('really', 1151), ('poor', 1134), ('best', 953), ('ok', 920), ('funny', 843), ('watch', 824), ('waste', 792), ('review', 786), ('worst', 733), ('worth', 727), ('didn', 726), ('plot', 709), ('little', 701), ('disappointed', 697), ('slow', 677), ('love', 659), ('acting', 650), ('disappointment', 629), ('quot', 619), ('way', 615), ('series', 613), ('pretty', 590), ('original', 579), ('terrible', 579), ('interesting', 574), ('movies', 573), ('did', 563), ('comedy', 561), ('quality', 561), ('big', 557), ('money', 545), ('horror', 544), ('ray', 527), ('book', 524), ('weak', 521), ('blu', 516), ('action', 505), ('dull', 496), ('version', 488), ('long', 485), ('expected', 476), ('star', 471), ('doesn', 468), ('make', 45

In [36]:
print('1:')
score1_200_words = [w for w, count in most_frequent_words_text[0]]
print(score1_200_words)
print('2:')
score2_200_words = [w for w, count in most_frequent_words_text[1]]
print(score2_200_words)
print('3:')
score3_200_words = [w for w, count in most_frequent_words_text[2]]
print(score3_200_words)
print('4:')
score4_200_words = [w for w, count in most_frequent_words_text[3]]
print(score4_200_words)
print('5:')
score5_200_words = [w for w, count in most_frequent_words_text[4]]
print(score5_200_words)

1:
2:
['movie', 'good', 'bad', 'film', 'great', 'boring', 'disappointing', 'just', 'like', 'better', 'don', 'story', 'dvd', 'stars', 'time', 'really', 'poor', 'best', 'ok', 'funny', 'watch', 'waste', 'review', 'worst', 'worth', 'didn', 'plot', 'little', 'disappointed', 'slow', 'love', 'acting', 'disappointment', 'quot', 'way', 'series', 'pretty', 'original', 'terrible', 'interesting', 'movies', 'did', 'comedy', 'quality', 'big', 'money', 'horror', 'ray', 'book', 'weak', 'blu', 'action', 'dull', 'version', 'long', 'expected', 'star', 'doesn', 'make', 've', 'old', 'stupid', 'predictable', 'okay', 'lame', 'overrated', 'mediocre', 'oh', 'fun', 'horrible', 'season', 'seen', 'special', 'awful', 'hard', 'new', 'script', 'sequel', 'silly', 'say', 'man', 'classic', 'ending', 'real', 'flick', 'average', 'let', 'save', 'cast', 'fans', 'watching', 'buy', 'dumb', 'people', 'does', 'end', 'remake', 'potential', 'idea', 'effects', 'entertaining', 'fan', 'mess', 'video', 'meh', 'know', 'think', 'lost'

In [37]:
print('1:')
uniques1_text = [w for w in score1_200_words if (w not in score2_200_words and w not in score3_200_words and w not in score4_200_words and w not in score5_200_words )]
print(uniques1_text)
print('2:')
uniques2_text = [w for w in score2_200_words if (w not in score1_200_words and w not in score3_200_words and w not in score4_200_words and w not in score5_200_words )]
print(uniques2_text)
print('3:')
uniques3_text = [w for w in score3_200_words if (w not in score2_200_words and w not in score1_200_words and w not in score4_200_words and w not in score5_200_words )]
print(uniques3_text)
print('4:')
uniques4_text = [w for w in score4_200_words if (w not in score2_200_words and w not in score3_200_words and w not in score1_200_words and w not in score5_200_words )]
print(uniques4_text)
print('5:')
uniques5_text = [w for w in score5_200_words if (w not in score2_200_words and w not in score3_200_words and w not in score4_200_words and w not in score1_200_words )]
print(uniques5_text)

1:
2:
['missed', 'goes', 'premise', 'flat', 'hype', 'falls', 'director', 'sound', 'cheesy', 'confusing', 'blah']
3:
['mixed', 'moments', 'somewhat', 'watchable', 'sure', 'alright', 'eh', 'uneven', 'fair', 'expect']
4:
['surprisingly', 'adaptation', 'charming', 'noir', 'creepy', 'underrated', 'early', 'twist', 'extras', 'age', 'strong', 'suspense', 'fantasy', 'ride', 'worthy']
5:
['masterpiece', 'greatest', 'outstanding', 'superb', 'gift', 'shows', 'super', 'incredible', 'terrific', 'favorites', 'rocks', 'happy', 'trek', 'miss', 'fabulous', 'seasons', 'funniest', 'british', 'men', 'loves', 'television', 'laugh', 'stunning', 'highly', 'rock', 'memories', 'touching', 'finest', 'delightful', 'house', 'concert', 'home']


In [38]:
uniques_text = [uniques1_text, uniques2_text, uniques3_text, uniques4_text, uniques5_text]

In [65]:
X_train_origin_processed = preprocess(X_train_origin, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text)

In [66]:
X_sub = preprocess(X_submission, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text).drop(['Score'], axis=1)

In [67]:
X_train_origin_processed.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_train_origin_processed.csv", index=False)
X_sub.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_sub.csv", index=False)

In [68]:
X_train_origin_processed_nn = preprocess_nonnegative(X_train_origin, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text)

In [69]:
X_sub_nn = preprocess_nonnegative(X_submission, ws_array_normalized, the_vocab, ws_array_normalized2, the_vocab2, film_dic, user_dic, film_dic_std, user_dic_std, avg_all_scores, uniques_summary, uniques_text).drop(['Score'], axis=1)

In [70]:
X_train_origin_processed_nn.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_train_origin_processed_nn.csv", index=False)
X_sub_nn.to_csv("/content/gdrive/My Drive/Colab Notebooks/data/X_sub_nn.csv", index=False)