In [213]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re
# Data Preprocessing
path = "RottenTomatoes/DataSet/train.tsv"
df = pd.read_csv(path, sep="\t") 

# remove punctuation
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
# remove numbers
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')
# make it all lower case
df['Phrase'] = df['Phrase'].str.lower()
# remove non-asci characters
df.Phrase.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Phrase'] = df['Phrase'].str.split()

df['Tokenized_text'] = df['Phrase'].apply(word_tokenize) 

df['Sentiment']=df['Sentiment'].astype(int) #convert the star_rating column to int
df['NNLabels'] = df['Sentiment'].div(4)

df= df[df['Sentiment']!=2]

#df['label']=np.where(df['Sentiment']>=4,1,0) #1-Positve,0-Negative
# convert to NumPy Array
#train = df['Phrase'].to_numpy()
from sklearn.model_selection import train_test_split
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

X_train,X_test,Y_train, Y_test = train_test_split(df['Phrase'], df['Sentiment'], test_size=0.2, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

X_train,X_valid,Y_train, Y_valid = train_test_split(X_train,Y_train, test_size=0.5, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_valid.shape,Y_valid.shape))
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
tf_x_train = tfidfvectorizer.fit_transform(X_train)
tf_x_valid = tfidfvectorizer.transform(X_valid)


[nltk_data] Downloading package punkt to /Users/wjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
  df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')


Train:  (61182,) (61182,) Test:  ((15296,), (15296,))
Train:  (30591,) (30591,) Test:  ((30591,), (30591,))


In [214]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
K = 55
scale = StandardScaler()
X_train_normalized = scale.fit_transform(tf_x_train.toarray())
X_valid_normalized = scale.fit_transform(tf_x_valid.toarray())


In [215]:
kmeans = MiniBatchKMeans(n_clusters=K, random_state=10, n_init=10)
nn_X_train = kmeans.fit_transform(tf_x_train.toarray().astype(float))
preds = kmeans.fit_transform(tf_x_valid.toarray().astype(float))

In [216]:
nn_X_train_normalized = scale.fit_transform(nn_X_train)
nn_X_valid_normalized = scale.fit_transform(preds)

In [217]:
print(nn_X_valid_normalized.shape)
print(nn_X_train_normalized.shape)

(30591, 55)
(30591, 55)


In [232]:
#from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder(sparse=False)

In [219]:
#nn_Y_train = encoder.fit_transform(Y_train.array.reshape(-1,1))
#nn_Y_valid = encoder.fit_transform(Y_valid.array.reshape(-1,1))

In [220]:
#nn_Y_valid

In [221]:
import torch
import torch.nn as nn

In [224]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron.
  '''
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
      nn.Flatten(),
      nn.Linear(55, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 16),
      nn.ReLU(),
      nn.Linear(16,5),
      nn.LogSoftmax(dim=1)
    )


  def forward(self, x):
    '''Forward pass'''
    return self.layers(x)

In [225]:
import torch
import torch.nn as nn
# Define the number of input features



Y_tensor = torch.tensor(Y_train.array).long()
X_tensor = torch.tensor(nn_X_train_normalized).float()
mlp = MLP()
#print(X_tensor.shape)
# make a dataset object
dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)

# then use it to make a dataloader object
trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)
# make a scoring function
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)

num_epochs = 50

for epoch in range(0, num_epochs): # num_epochs at maximum

      # Print epoch
      print(f'Starting epoch {epoch+1}')

      # Set current loss value
      current_loss = 0.0
      
      for i, data in enumerate(trainloader, 0):

            # Get inputs
            inputs, targets = data
            
            # Prepare targets
            #targets = targets.type(torch.FloatTensor).reshape((targets.shape[0], 4))

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = mlp(inputs)
            #print(outputs.shape)
            #print(targets[4].value)
            # Compute loss
            loss = loss_function(outputs, targets)

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

            # Print statistics

            current_loss += loss.item()
            if i % 500 == 499:
                  print('Loss after mini-batch %5d: %.3f' %
                        (i + 1, current_loss / 500))
                  current_loss = 0.0



#plt.scatter(trainX[:,0], trainX[:, 1], c=pred_labels)


Starting epoch 1
Loss after mini-batch   500: 1.267
Loss after mini-batch  1000: 1.196
Loss after mini-batch  1500: 1.170
Loss after mini-batch  2000: 1.174
Loss after mini-batch  2500: 1.170
Loss after mini-batch  3000: 1.158
Starting epoch 2
Loss after mini-batch   500: 1.152
Loss after mini-batch  1000: 1.170
Loss after mini-batch  1500: 1.165
Loss after mini-batch  2000: 1.166
Loss after mini-batch  2500: 1.170
Loss after mini-batch  3000: 1.150
Starting epoch 3
Loss after mini-batch   500: 1.156
Loss after mini-batch  1000: 1.142
Loss after mini-batch  1500: 1.158
Loss after mini-batch  2000: 1.161
Loss after mini-batch  2500: 1.153
Loss after mini-batch  3000: 1.174
Starting epoch 4
Loss after mini-batch   500: 1.145
Loss after mini-batch  1000: 1.152
Loss after mini-batch  1500: 1.136
Loss after mini-batch  2000: 1.169
Loss after mini-batch  2500: 1.157
Loss after mini-batch  3000: 1.153
Starting epoch 5
Loss after mini-batch   500: 1.160
Loss after mini-batch  1000: 1.144
Loss 

In [230]:

X_valid_tensor = torch.tensor(nn_X_valid_normalized).float()
preds = mlp(X_tensor).detach().numpy()


In [227]:
preds

array([[  -3.7406251 ,   -0.16749978, -124.57078   ,   -2.045095  ,
          -6.7979527 ],
       [  -3.0730174 ,   -0.26824874, -103.289604  ,   -1.6845431 ,
          -5.6630354 ],
       [  -2.3391626 ,   -0.51748246,  -90.87758   ,   -1.2252581 ,
          -4.2765393 ],
       ...,
       [  -3.0724888 ,   -0.2683256 , -103.277145  ,   -1.6844051 ,
          -5.6605167 ],
       [  -3.0729973 ,   -0.26828748, -103.28547   ,   -1.6844013 ,
          -5.6623297 ],
       [  -3.0729558 ,   -0.2683688 , -103.2768    ,   -1.6841044 ,
          -5.6608524 ]], dtype=float32)

In [228]:
test_output_labels = np.argmax(preds,1)


In [231]:
from sklearn.metrics import classification_report
report=classification_report(Y_train.array, test_output_labels.astype(int),output_dict=True)
report

{'0': {'precision': 0.060240963855421686,
  'recall': 0.001787629603146228,
  'f1-score': 0.003472222222222222,
  'support': 2797},
 '1': {'precision': 0.3562447611064543,
  'recall': 0.8990159109721328,
  'f1-score': 0.5102839841302986,
  'support': 10873},
 '3': {'precision': 0.44860994102780116,
  'recall': 0.08056585218246463,
  'f1-score': 0.13659975630090426,
  'support': 13219},
 '4': {'precision': 0.14388489208633093,
  'recall': 0.02701242571582928,
  'f1-score': 0.04548555833522856,
  'support': 3702},
 'accuracy': 0.35778496943545485,
 'macro avg': {'precision': 0.252245139519002,
  'recall': 0.2520954546183932,
  'f1-score': 0.17396038024716343,
  'support': 30591},
 'weighted avg': {'precision': 0.3433944606048901,
  'recall': 0.35778496943545485,
  'f1-score': 0.24622043347726333,
  'support': 30591}}

In [233]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
reports = []

for i in range(10,50):
  scale = StandardScaler()
  X_train_normalized = scale.fit_transform(tf_x_train.toarray())
  X_valid_normalized = scale.fit_transform(tf_x_valid.toarray())

  kmeans = MiniBatchKMeans(n_clusters=i, random_state=10, n_init=10)
  nn_X_train = kmeans.fit_transform(tf_x_train.toarray().astype(float))
  preds = kmeans.fit_transform(tf_x_valid.toarray().astype(float))
  nn_X_train_normalized = scale.fit_transform(nn_X_train)
  nn_X_valid_normalized = scale.fit_transform(preds)
  print(nn_X_valid_normalized.shape)
  print(nn_X_train_normalized.shape)
  #from sklearn.preprocessing import OneHotEncoder
  #encoder = OneHotEncoder(sparse=False)
  #nn_Y_train = encoder.fit_transform(Y_train.array.reshape(-1,1))
  #nn_Y_valid = encoder.fit_transform(Y_valid.array.reshape(-1,1))
  #nn_Y_valid
  import torch
  import torch.nn as nn
  class MLP(nn.Module):
    '''
      Multilayer Perceptron.
    '''
    def __init__(self):
      super().__init__()
      self.layers = nn.Sequential(
        nn.Flatten(),
        nn.Linear(i, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU(),
        nn.Linear(16,5),
        nn.LogSoftmax(dim=1)
      )


    def forward(self, x):
      '''Forward pass'''
      return self.layers(x)

  import torch
  import torch.nn as nn
  # Define the number of input features



  Y_tensor = torch.tensor(Y_train.array).long()
  X_tensor = torch.tensor(nn_X_train_normalized).float()
  mlp = MLP()
  #print(X_tensor.shape)
  # make a dataset object
  dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)

  # then use it to make a dataloader object
  trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)
  # make a scoring function
  loss_function = nn.NLLLoss()
  optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)

  num_epochs = 50

  for epoch in range(0, num_epochs): # num_epochs at maximum

        # Print epoch
        print(f'Starting epoch {epoch+1}')

        # Set current loss value
        current_loss = 0.0
        
        for i, data in enumerate(trainloader, 0):

              # Get inputs
              inputs, targets = data
              
              # Prepare targets
              #targets = targets.type(torch.FloatTensor).reshape((targets.shape[0], 4))

              # Zero the gradients
              optimizer.zero_grad()

              # Perform forward pass
              outputs = mlp(inputs)
              #print(outputs.shape)
              #print(targets[4].value)
              # Compute loss
              loss = loss_function(outputs, targets)

              # Perform backward pass
              loss.backward()

              # Perform optimization
              optimizer.step()

              # Print statistics

              current_loss += loss.item()
              if i % 500 == 499:
                    print('Loss after mini-batch %5d: %.3f' %
                          (i + 1, current_loss / 500))
                    current_loss = 0.0



  #plt.scatter(trainX[:,0], trainX[:, 1], c=pred_labels)



  X_valid_tensor = torch.tensor(nn_X_valid_normalized).float()
  preds = mlp(X_tensor).detach().numpy()

  preds
  test_output_labels = np.argmax(preds,1)

  report=classification_report(Y_train.array, test_output_labels.astype(int),output_dict=True)
  reports.append(report)
  print(report)

(30591, 10)
(30591, 10)
Starting epoch 1
Loss after mini-batch   500: 1.281
Loss after mini-batch  1000: 1.201
Loss after mini-batch  1500: 1.203
Loss after mini-batch  2000: 1.183
Loss after mini-batch  2500: 1.181
Loss after mini-batch  3000: 1.176
Starting epoch 2
Loss after mini-batch   500: 1.190
Loss after mini-batch  1000: 1.173
Loss after mini-batch  1500: 1.182
Loss after mini-batch  2000: 1.181
Loss after mini-batch  2500: 1.186
Loss after mini-batch  3000: 1.183
Starting epoch 3
Loss after mini-batch   500: 1.186
Loss after mini-batch  1000: 1.179
Loss after mini-batch  1500: 1.185
Loss after mini-batch  2000: 1.163
Loss after mini-batch  2500: 1.173
Loss after mini-batch  3000: 1.155
Starting epoch 4
Loss after mini-batch   500: 1.172
Loss after mini-batch  1000: 1.168
Loss after mini-batch  1500: 1.172
Loss after mini-batch  2000: 1.163
Loss after mini-batch  2500: 1.179
Loss after mini-batch  3000: 1.152
Starting epoch 5
Loss after mini-batch   500: 1.176
Loss after mini-

ERROR: Unexpected segmentation fault encountered in worker.
 

RuntimeError: DataLoader worker (pid 36683) is killed by signal: Segmentation fault: 11. 

In [235]:
torch.save(mlp.state_dict(), 'KMeans/model.pth')

In [None]:
reports = []

In [239]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

i=40

scale = StandardScaler()
X_train_normalized = scale.fit_transform(tf_x_train.toarray())
X_valid_normalized = scale.fit_transform(tf_x_valid.toarray())

kmeans = MiniBatchKMeans(n_clusters=i, random_state=10, n_init=10)
nn_X_train = kmeans.fit_transform(tf_x_train.toarray().astype(float))
preds = kmeans.fit_transform(tf_x_valid.toarray().astype(float))
nn_X_train_normalized = scale.fit_transform(nn_X_train)
nn_X_valid_normalized = scale.fit_transform(preds)
print(nn_X_valid_normalized.shape)
print(nn_X_train_normalized.shape)
#from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder(sparse=False)
#nn_Y_train = encoder.fit_transform(Y_train.array.reshape(-1,1))
#nn_Y_valid = encoder.fit_transform(Y_valid.array.reshape(-1,1))
#nn_Y_valid
import torch
import torch.nn as nn
class MLP(nn.Module):
  '''
    Multilayer Perceptron.
  '''
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
      nn.Flatten(),
      nn.Linear(i, 64),
      nn.ReLU(),
      nn.Linear(64, 128),
      nn.ReLU(),
      nn.Linear(128, 256),
      nn.ReLU(),
      nn.Linear(256,64),
      nn.ReLU(),
      nn.Linear(64,5),
      nn.LogSoftmax(dim=1)
    )


  def forward(self, x):
    '''Forward pass'''
    return self.layers(x)

import torch
import torch.nn as nn
# Define the number of input features



Y_tensor = torch.tensor(Y_train.array).long()
X_tensor = torch.tensor(nn_X_train_normalized).float()
mlp = MLP()
#print(X_tensor.shape)
# make a dataset object
dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)

# then use it to make a dataloader object
trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)
# make a scoring function
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)

num_epochs = 100

for epoch in range(0, num_epochs): # num_epochs at maximum

      # Print epoch
      print(f'Starting epoch {epoch+1}')

      # Set current loss value
      current_loss = 0.0
      
      for i, data in enumerate(trainloader, 0):

            # Get inputs
            inputs, targets = data
            
            # Prepare targets
            #targets = targets.type(torch.FloatTensor).reshape((targets.shape[0], 4))

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = mlp(inputs)
            #print(outputs.shape)
            #print(targets[4].value)
            # Compute loss
            loss = loss_function(outputs, targets)

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

            # Print statistics

            current_loss += loss.item()
            if i % 500 == 499:
                  print('Loss after mini-batch %5d: %.3f' %
                        (i + 1, current_loss / 500))
                  current_loss = 0.0



#plt.scatter(trainX[:,0], trainX[:, 1], c=pred_labels)



X_valid_tensor = torch.tensor(nn_X_valid_normalized).float()
preds = mlp(X_tensor).detach().numpy()

preds
test_output_labels = np.argmax(preds,1)

report=classification_report(Y_train.array, test_output_labels.astype(int),output_dict=True)
reports.append((report, i))
print(report)

(30591, 40)
(30591, 40)
Starting epoch 1
Loss after mini-batch   500: 1.233
Loss after mini-batch  1000: 1.194
Loss after mini-batch  1500: 1.193
Loss after mini-batch  2000: 1.184
Loss after mini-batch  2500: 1.189
Loss after mini-batch  3000: 1.179
Starting epoch 2
Loss after mini-batch   500: 1.176
Loss after mini-batch  1000: 1.174
Loss after mini-batch  1500: 1.178
Loss after mini-batch  2000: 1.202
Loss after mini-batch  2500: 1.180
Loss after mini-batch  3000: 1.184
Starting epoch 3
Loss after mini-batch   500: 1.173
Loss after mini-batch  1000: 1.184
Loss after mini-batch  1500: 1.168
Loss after mini-batch  2000: 1.179
Loss after mini-batch  2500: 1.182
Loss after mini-batch  3000: 1.177
Starting epoch 4
Loss after mini-batch   500: 1.175
Loss after mini-batch  1000: 1.183
Loss after mini-batch  1500: 1.167
Loss after mini-batch  2000: 1.173
Loss after mini-batch  2500: 1.181
Loss after mini-batch  3000: 1.169
Starting epoch 5
Loss after mini-batch   500: 1.166
Loss after mini-