In [1]:
import re
import random
import spacy
import pandas as pd
from typing import Set
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

### Loading and displaying basic info of the movie dataset

In [2]:
df = pd.read_csv('data/movie1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20000 non-null  int64 
 1   text        20000 non-null  object
 2   label       20000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 468.9+ KB


In [3]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,text,label
0,5296,The original Road House was a classic cheesy 8...,0
1,592,"Great voices, lots of adventure and clever dia...",1
2,11403,Pumpkinhead was in itself a decent 80s horror ...,0
3,23541,This is another of Robert Altman's underrated ...,1
4,28770,This wasn't the major disaster that I was expe...,0
5,6431,"This is Jackie Chan's best film, and my person...",1
6,25399,I've heard people compare this movie to Sidewa...,0
7,19690,"Yes, that's right, it is. I firmly believe tha...",1
8,19235,For readers who have already seen one of Miyaz...,1
9,29550,"""The Days"" is a typical family drama with a li...",1


### Preprocessing text data

In [4]:
def preprocess_text(text: str, stop_words: Set[str]) -> str:
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.IGNORECASE).lower().strip()
    tokens = wordpunct_tokenize(text)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(filtered_tokens)

stoplist = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda doc: preprocess_text(doc, stoplist))
df.head(10)

Unnamed: 0.1,Unnamed: 0,text,label
0,5296,original road house classic cheesy movie altho...,0
1,592,great voices lots adventure clever dialogue ma...,1
2,11403,pumpkinhead decent horror flick classic means ...,0
3,23541,another robert altmans underrated filmslets ho...,1
4,28770,wasnt major disaster expecting positive descri...,0
5,6431,jackie chans best film personal favourite disa...,1
6,25399,ive heard people compare movie sideways compar...,0
7,19690,yes thats right firmly believe n weird looking...,1
8,19235,readers already seen one miyazakis films still...,1
9,29550,days typical family drama little catch must re...,1


### Training and evaluating an SVC model for sentiment analysis

In [5]:
X, y = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

clf = SVC()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Accuracy: {round(accuracy_score(y_test, y_pred), 3)}')

Confusion Matrix:
[[1773  258]
 [ 199 1770]]
Accuracy: 0.886


### Predicting sentiment using TextBlob 

In [6]:
def get_sentiment(text: str) -> int:
    analysis = TextBlob(text)
    return 1 if analysis.sentiment.polarity > 0 else 0

df['predicted_label'] = df['text'].apply(get_sentiment)
y_true, y_pred = df['label'], df['predicted_label']

print(f'Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}')
print(f'Accuracy: {round(accuracy_score(y_true, y_pred), 3)}')

Confusion Matrix:
[[4677 5332]
 [ 669 9322]]
Accuracy: 0.7


### Comparing sentiment predictions from TextBlob and SVC for randomly selected text samples

In [7]:
for idx in random.sample(range(len(df)), 3):
    text, true_label = df.loc[idx, 'text'], df.loc[idx, 'label']

    predicted_label_textblob = get_sentiment(text)

    text_transformed = vectorizer.transform([text])
    predicted_label_svc = clf.predict(text_transformed)[0]

    print(f'Text #{idx}: {text[:100]}...')
    print(f'True label: {true_label}')
    print(f'Predicted label (TextBlob): {predicted_label_textblob}')
    print(f'Predicted label (SVC): {predicted_label_svc}\n')

Text #3728: time culture reality exposed narrative overpowering fiction know small big screen apart film exposes...
True label: 1
Predicted label (TextBlob): 1
Predicted label (SVC): 1

Text #4558: warmest engaging movie genre lips eyes made smile cry reminded work takes pursue dream pain disappoi...
True label: 1
Predicted label (TextBlob): 1
Predicted label (SVC): 1

Text #18673: one shows wanted followup id couldnt bring devoting time show show centers topic politics really nee...
True label: 0
Predicted label (TextBlob): 1
Predicted label (SVC): 0



### Extracting stop words, nouns, persons, and dates from the text using Spacy

In [8]:
with open('data/lab6-2.txt', 'r') as file:
    content = file.read()

nlp = spacy.load('en_core_web_sm')
doc = nlp(content)

stop_words = set([token.text for token in doc if token.is_stop])
print(f'Stop words in the text:\n{stop_words}\n')

nouns = [token.text for token in doc if token.pos_ == 'NOUN']
print(f'Nouns in the text:\n{nouns}\n')

persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
print(f'Persons in the text:\n{persons}\n')

dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
print(f'Dates in the text:\n{dates}\n')

Stop words in the text:
{'The', 'the', 'his', 'an', 'for', 'where', 'last', 'A', 'to', 'then', 'before', 'He', 'when', 'from', 'of', 'on', 'was', 'at', 'while', 'he', 'more', 'as', 'one', 'is', 'became', 'also', 'May', 'And', 'with', "'s", 'and', 'through', 'in', 'a', 'who'}

Nouns in the text:
['rise', 'ranks', 'government', 'promotion', 'minister', 'seat', 'job', 'school', 'standards', 'minister', 'role', 'election', 'figure', 'head', 'policy', 'unit', 'member', 'manifesto', 'team', 'figures', 'government', 'policy', 'unit', 'leader', 'opposition', 'glance', 'family', 'background', 'pedigree', 'form', 'father', 'lieutenant', 'brother', 'advisor', 'time', 'year', 'politics', 'philosophy', 'economics', 'MSc', 'science']

Persons in the text:
["David Miliband's", 'Tony Blair', 'Ralph', 'David Miliband', 'Blair', 'Ed', 'Gordon Brown', 'Mr Blair', 'David Miliband']

Dates in the text:
['2001', 'May 2002', '1994', '39-year-old']

