## **Perform imports and load the dataset into a pandas dataframe**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
from glob import glob
import pandas as pd

train_pos_path = glob('gdrive/MyDrive/Colab Notebooks/aclImdb/train/pos/*.txt')
train_neg_path = glob('gdrive/MyDrive/Colab Notebooks/aclImdb/train/neg/*.txt')

test_pos_path = glob('gdrive/MyDrive/Colab Notebooks/aclImdb/test/pos/*.txt')
test_neg_path = glob('gdrive/MyDrive/Colab Notebooks/aclImdb/test/neg/*.txt')

In [14]:
train_pos=[]
for path in train_pos_path[:1000]:
  train_pos.append(open(path, encoding='utf8').read())

train_neg=[]
for path in train_neg_path[:1000]:
  train_neg.append(open(path, encoding='utf8').read())

test_pos=[]
for path in train_pos_path[:1000]:
  test_pos.append(open(path, encoding='utf8').read())

test_neg=[]
for path in train_neg_path[:1000]:
  test_neg.append(open(path, encoding='utf8').read())

In [15]:
train = train_pos + train_neg
test = test_pos + test_neg

In [16]:
ltrain = []
for cm in range(len(train_pos)):
  ltrain.append('pos')
for cm in range(len(train_neg)):
  ltrain.append('neg')

ltest = []
for cm in range(len(test_pos)):
  ltest.append('pos')
for cm in range(len(test_neg)):
  ltest.append('neg')

In [33]:
df_train = pd.DataFrame(data={'Comments': train, 'Lables': ltrain})
df_test = pd.DataFrame(data={'Comments': test, 'Lables': ltest})

In [34]:
df_train

Unnamed: 0,Comments,Lables
0,This is a fine musical with a timeless score b...,pos
1,It tries to be the epic adventure of the centu...,pos
2,Claire Denis' debut is both a brave and self-a...,pos
3,A THIEF IN THE NIGHT is an excellent fictional...,pos
4,Mirage (1990) is a very rare horror/chiller fr...,pos
...,...,...
1995,"I don't remember ""Barnaby Jones"" being no more...",neg
1996,This is one of them movies that has a awesome ...,neg
1997,"I saw the MST3K version of ""Deathstalker III"" ...",neg
1998,I was searching through Hollywood video last n...,neg


## **Check for missing values**

In [20]:
df_train.dropna(inplace = True)

In [21]:
df_test.dropna(inplace = True)

In [32]:
if df_train.isnull().values.any():
    print('!!!')

In [37]:
if df_test.isnull().values.any():
    print('!!!')

# **Vectorize the data**

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(df_train['Comments'])
v_train = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
v_train = v_train.sort_values('TF-IDF', ascending=False)
#print (v_train.head())

             TF-IDF
liked      0.338205
the        0.253977
dot        0.220839
composers  0.209516
parisien   0.209516


In [86]:
vector_train=[]
for text in df_train['Comments']:
  vector_train.append(tfIdfVectorizer.transform([text]).toarray()[0])

vector_test=[]
for text in df_test['Comments']:
  vector_test.append(tfIdfVectorizer.transform([text]).toarray()[0])

# **Train and fit the model**

In [94]:
from sklearn import svm
clf = svm.SVC()
clf.fit(vector_train, df_train['Lables'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# **Run predictions and analyze the result**

In [95]:
predictions = clf.predict(vector_test)

In [97]:
from sklearn.metrics import accuracy_score
print(accuracy_score(df_test['Lables'],predictions))

0.9995
