# Dataset2: sentiment-analysis-on-movie-reviews

In [104]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
from collections import Counter
# from sklearn.model_selection import train_test_split

# import warnings
# warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

import numpy as np

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [105]:
def read_file(path):
  rawdata = pd.read_excel(path)
  return rawdata

In [106]:
def preprocess_data(df):
  reviews = []
  for raw in tqdm(df['Phrase']):
      text = BeautifulSoup(raw, 'lxml').get_text()
      only_text = re.sub('[^a-zA-Z]', ' ', text)
      words = word_tokenize(only_text.lower())
      stops = set(stopwords.words('english'))
      non_stopwords = [word for word in words if not word in stops]
      lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
      reviews.append(lemma_words)
  return reviews

In [107]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [108]:
# Function for standardizing data
def standardScaler(feature_array):
    num = feature_array.shape[1] # total number of columns
    for i in range(num): # iterating through each column
        feature = feature_array[:, i]
        mean = feature.mean() # mean stores mean value for the column
        std = feature.std() # std stores standard deviation value for the column
        feature_array[:, i] = (feature_array[:, i] - mean) / std # standard scaling of each element of the column
    return feature_array


In [109]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

## Decision Tree


In [110]:
def entropy(x):
    '''
    calculates entropy of x
    
    input_ : x (a list of values)
    output : float, entropy value
    '''
    counts = np.bincount(np.array(x, dtype=np.int64))
    percentages = counts / len(x)

    # Caclulate entropy

    entropy = 0
    for p in percentages:
        if p > 0:
            entropy += p * np.log2(p)
    entropy = -entropy
    return entropy

In [111]:
def information_gain(parent, left_child, right_child):
    '''
    calculates information gain of a node  
    
    input_ : parent_list, child_list (left child and right child)
    output : float, information gain value
    '''
    left_num = len(left_child) / len(parent)
    right_num = len(right_child) / len(parent)

    child = left_num * entropy(left_child) + right_num * entropy(right_child)
    
    return entropy(parent) - child

In [112]:
class Node:
    '''
    define the node in the decistion tree
    
    '''
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value

In [113]:
class DecisionTree:
  '''
  implementing decisicion tree  
  
  '''
  def __init__(self, min_samples_split=2, max_depth=3):
      self.min_samples_split = min_samples_split
      self.max_depth = max_depth
      self.root = None
      
  def _best_split(self, X, y):
      '''
      calculates the best split for given features and target  
      
      input_ : X = features, y = target 
      output : best_split (dict)
      '''
      best_split = {}
      best_info_gain = -1
      n_rows, n_cols = X.shape
      
      # For every dataset feature
      for f_idx in range(n_cols):
          X_curr = X[:, f_idx]
          # For every unique value of that feature
          for threshold in np.unique(X_curr):
              # Construct a dataset and split it to the left and right parts
              # Left part includes records lower or equal to the threshold
              # Right part includes records higher than the threshold
              df = np.concatenate((X, y.reshape(1, -1).T), axis=1)
              df_left = np.array([row for row in df if row[f_idx] <= threshold])
              df_right = np.array([row for row in df if row[f_idx] > threshold])

              # Do the calculation only if there's data in both subsets
              if len(df_left) > 0 and len(df_right) > 0:
                  # Obtain the value of the target variable for subsets
                  y = df[:, -1]
                  y_left = df_left[:, -1]
                  y_right = df_right[:, -1]

                  # Caclulate the information gain and save the split parameters
                  # if the current split if better then the previous best
                  gain = information_gain(y, y_left, y_right)
                  if gain > best_info_gain:
                      best_split = {
                          'feature_index': f_idx,
                          'threshold': threshold,
                          'df_left': df_left,
                          'df_right': df_right,
                          'gain': gain
                      }
                      best_info_gain = gain
      return best_split
  
  def _build(self, X, y, depth=0):
      '''
      build a decision tree   
      
      input_ : X = features, y = target, depth 
      output : node
      '''
      n_rows, n_cols = X.shape
      
      # Check to see if a node should be leaf node
      if n_rows >= self.min_samples_split and depth <= self.max_depth:
          # Get the best split
          best = self._best_split(X, y)
          # If the split isn't pure
          if best['gain'] > 0:
              # Build a tree on the left
              left = self._build(
                  X=best['df_left'][:, :-1], 
                  y=best['df_left'][:, -1], 
                  depth=depth + 1
              )
              right = self._build(
                  X=best['df_right'][:, :-1], 
                  y=best['df_right'][:, -1], 
                  depth=depth + 1
              )
              return Node(
                  feature=best['feature_index'], 
                  threshold=best['threshold'], 
                  data_left=left, 
                  data_right=right, 
                  gain=best['gain']
              )
      # Leaf node - value is the most common target value 
      return Node(
          value=Counter(y).most_common(1)[0][0]
      )
  
  def fit(self, X, y):
      '''
      Train with given features and target  
      
      input_ : X = features, y = target 
      output : //
      '''
      # Call a recursive function to build the tree
      self.root = self._build(X, y)
      
  def _predict(self, x, tree):
      '''
      classify a single test data  
      
      input_ : x (one input data)
      output : class (prediction)
      '''
      # Leaf node
      if tree.value != None:
          return tree.value
      feature_value = x[tree.feature]
      
      # Go to the left
      if feature_value <= tree.threshold:
          return self._predict(x=x, tree=tree.data_left)
      
      # Go to the right
      if feature_value > tree.threshold:
          return self._predict(x=x, tree=tree.data_right)
      
  def predict(self, X):
      '''
      classify all data
      
      :param X: np.array, features
      :return: np.array, predicted classes
      '''
      # Call the _predict() function for every observation
      return [self._predict(x, self.root) for x in X]

## Connect google drive and Import training data and testing data


In [114]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [115]:
# read_file(path): trianing_data features, trianing_data target, testing_data features 
train_x = read_file('/content/drive/MyDrive/ml_hw/Dataset2_train/Dataset2_train/X_train.xlsx')
train_y = read_file('/content/drive/MyDrive/ml_hw/Dataset2_train/Dataset2_train/y_train.xlsx')
test_x = read_file('/content/drive/MyDrive/ml_hw/Dataset2_test/X_test.xlsx')

## Dealing missing data

In [116]:
for feature in train_x:
  train_x[feature] = train_x[feature].fillna((train_x[feature][0]))

## Turn the input data to suitable datatye

In [117]:
train_text = preprocess_data(train_x)
test_text = preprocess_data(test_x)
# target = train_y.Sentiment.values
# X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|██████████| 124848/124848 [01:14<00:00, 1683.55it/s]
100%|██████████| 31212/31212 [00:18<00:00, 1643.18it/s]


In [118]:
X_train_, X_test_ = tokenizer_preprocess(train_text, test_text)  

100%|██████████| 124848/124848 [00:00<00:00, 862460.19it/s]


In [119]:
features = np.array(X_train_)
target = np.array(train_y.to_numpy())
features_test = np.array(X_test_)
features.shape, target.shape, features_test.shape

((124848, 30), (124848, 1), (31212, 30))

In [120]:
features_scaled = standardScaler(X_train_) 
testfeat_scaled = standardScaler(X_test_)

  feature_array[:, i] = (feature_array[:, i] - mean) / std # standard scaling of each element of the column


In [121]:
X_train = np.array(features_scaled)
y_train = np.array(train_y['Sentiment'].to_numpy())
X_test = np.array(testfeat_scaled)
X_train.shape, y_train.shape, X_test.shape

((124848, 30), (124848,), (31212, 30))

## Data Observation



In [122]:
print(train_x['Phrase'].head(10))

0                           going to a house party and
1                                      a grand picture
2                                  lightweight meaning
3                                      most unpleasant
4    You can see the would-be surprises coming a mi...
5    this too-extreme-for-TV rendition of the notor...
6                    wickedly undramatic central theme
7    ... a fascinating curiosity piece -- fascinati...
8              fallible human beings , not caricatures
9    is so prolonged and boring it is n't even clos...
Name: Phrase, dtype: object


In [123]:
print(train_y['Sentiment'].describe())

count    124848.000000
mean          2.063581
std           0.893844
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64


In [124]:
print(train_y['Sentiment'].value_counts())

2    63665
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64


In [125]:
print(train_y['Sentiment'].value_counts()/train_y['Sentiment'].count())

2    0.509940
3    0.210993
1    0.174757
4    0.058992
0    0.045319
Name: Sentiment, dtype: float64


In [126]:
temp_df = train_x.isnull().sum().reset_index()
temp_df['Percentage of Null Values'] = temp_df[0]/len(train_x)*100
temp_df.columns = ['Column Name', 'Number of Null Values','Percentage of Null Values']
temp_df


Unnamed: 0,Column Name,Number of Null Values,Percentage of Null Values
0,Phrase,0,0.0


In [127]:
train_x.describe().T.style.background_gradient(cmap = "magma") 

Unnamed: 0,count,unique,top,freq
Phrase,124848,124847,going to a house party and,2


## Train with decision tree

In [128]:
model = DecisionTree()
model.fit(X_train, y_train)
preds = model.predict(X_test)

## Output data and Save the reult in excel file

In [129]:
df = pd.DataFrame(np.array(preds))
df.to_excel("Dataset2_pred.xlsx")