In [5]:
from google.colab import files
import zipfile
import os

# Step 1: Upload the file
uploaded = files.upload()

# Step 2: Extract archive.zip
with zipfile.ZipFile("archive.zip", "r") as zip_ref:
    zip_ref.extractall(".")

# Step 3: Check extracted folders
print(os.listdir("."))          # should list txt_sentoken
print(os.listdir("txt_sentoken"))  # should list ['pos', 'neg']


Saving archive.zip to archive (1).zip
['.config', 'txt_sentoken', 'archive (1).zip', 'archive.zip', 'sample_data']
['neg', 'pos']


In [6]:
import os
import pandas as pd

# Paths
pos_path = "txt_sentoken/pos"
neg_path = "txt_sentoken/neg"

# Load positive reviews
pos_reviews = [open(os.path.join(pos_path, f), encoding="latin-1").read() for f in os.listdir(pos_path)]

# Load negative reviews
neg_reviews = [open(os.path.join(neg_path, f), encoding="latin-1").read() for f in os.listdir(neg_path)]

# Make DataFrame
df = pd.DataFrame({
    "text": pos_reviews + neg_reviews,
    "label": ["pos"] * len(pos_reviews) + ["neg"] * len(neg_reviews)
})

print(df.shape)
print(df.head())


(2000, 2)
                                                text label
0  some of my friends who went to live in usa com...   pos
1  jack nicholson has a funny way of playing char...   pos
2   " seven " is one of the best mystery movies i...   pos
3  bowfinger is a good movie about the making of ...   pos
4  what's shocking about " carlito's way " is how...   pos


In [7]:
print(df.isnull().sum())

text     0
label    0
dtype: int64


In [8]:
df['label']=df['label'].map({'pos':1, 'neg':0})

In [9]:
print(df.head())

                                                text  label
0  some of my friends who went to live in usa com...      1
1  jack nicholson has a funny way of playing char...      1
2   " seven " is one of the best mystery movies i...      1
3  bowfinger is a good movie about the making of ...      1
4  what's shocking about " carlito's way " is how...      1


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [48]:
df['text'] = df['text'].str.lower()


In [49]:
import re
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))


In [50]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
vectorizer=TfidfVectorizer(stop_words='english')
x=vectorizer.fit_transform(df['text'])

In [54]:
y=df['label']

In [55]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [56]:
model=MultinomialNB()
model.fit(x_train,y_train)

In [57]:
y_pred=model.predict(x_test)

In [58]:
print(f"Accuracy_Score: {accuracy_score(y_test,y_pred)}")
print(f"Confusion_Matrix: {confusion_matrix(y_test,y_pred)}")
print(f"Classification_Report: {classification_report(y_test,y_pred)}")

Accuracy_Score: 0.8
Confusion_Matrix: [[168  33]
 [ 47 152]]
Classification_Report:               precision    recall  f1-score   support

           0       0.78      0.84      0.81       201
           1       0.82      0.76      0.79       199

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



#SVM

In [30]:
print(df.head())

                                                text  label
0  some of my friends who went to live in usa com...      1
1  jack nicholson has a funny way of playing char...      1
2   " seven " is one of the best mystery movies i...      1
3  bowfinger is a good movie about the making of ...      1
4  what's shocking about " carlito's way " is how...      1


In [38]:
df['text'] = df['text'].str.lower()


In [39]:
import re
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))


In [40]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [41]:
vectorizer=TfidfVectorizer(stop_words='english')
x=vectorizer.fit_transform(df['text'])

In [43]:
y=df['label']

In [44]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [45]:
model=SVC()
model.fit(x_train,y_train)

In [46]:
y_pred=model.predict(x_test)

In [47]:
print(f"Accuracy_Score: {accuracy_score(y_test,y_pred)}")
print(f"Confusion_Matrix: {confusion_matrix(y_test,y_pred)}")
print(f"Classification_Report: {classification_report(y_test,y_pred)}")

Accuracy_Score: 0.8
Confusion_Matrix: [[160  41]
 [ 39 160]]
Classification_Report:               precision    recall  f1-score   support

           0       0.80      0.80      0.80       201
           1       0.80      0.80      0.80       199

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

