# Mod 6.1: Working with Text & Introduction to Machine Learning

## Working With Strings

In [1]:
review = "  The Dark Knight (2008) - AMAZING movie!!! Heath Ledger was incredible.  "
print(review)

  The Dark Knight (2008) - AMAZING movie!!! Heath Ledger was incredible.  


In [2]:
clean_review = review.strip() ## removes unnecessary whitespaces
print(clean_review)

The Dark Knight (2008) - AMAZING movie!!! Heath Ledger was incredible.


In [3]:
replace_review = review.replace("movie", "film")
print(replace_review)

  The Dark Knight (2008) - AMAZING film!!! Heath Ledger was incredible.  


In [5]:
upper = review.upper()
print(upper)

lower = review.lower()
print(lower)

title  = review.title()
print(title)

  THE DARK KNIGHT (2008) - AMAZING MOVIE!!! HEATH LEDGER WAS INCREDIBLE.  
  the dark knight (2008) - amazing movie!!! heath ledger was incredible.  
  The Dark Knight (2008) - Amazing Movie!!! Heath Ledger Was Incredible.  


In [19]:
wordlist = review.split(" ")
print(wordlist)

['', '', 'The', 'Dark', 'Knight', '(2008)', '-', 'AMAZING', 'movie!!!', 'Heath', 'Ledger', 'was', 'incredible.', '', '']


## Working with Strings Using `regex`

In [27]:
import re

In [20]:
replace_a = re.sub(r'a', "A", review)
print(replace_a)

  The DArk Knight (2008) - AMAZING movie!!! HeAth Ledger wAs incredible.  


In [13]:
no_numbers = re.sub(r'\d', '', review)
print(no_numbers)

  The Dark Knight () - AMAZING movie!!! Heath Ledger was incredible.  


In [17]:
no_spaces = re.sub(r'\s+', " ", review)
print(no_spaces)

 The Dark Knight (2008) - AMAZING movie!!! Heath Ledger was incredible. 


In [12]:
## syntax: re.sub(pattern, replacement, text)
clean = re.sub(r'[^\w\s]', '', review)
print(clean)

  The Dark Knight 2008  AMAZING movie Heath Ledger was incredible  


## What to Do with Pre-Processed Text

In [34]:
no_punctuation = re.sub(r'[^\w\s]', '', review)
no_extra_spaces = no_spaces = re.sub(r'\s+', " ", no_punctuation)
clean = no_extra_spaces.lower()
clean_split = clean.split()

print(clean)
print(clean_split)

this drama takes itself way too seriously and becomes unintentionally funny the performances are overthetop and melodramatic to the point of parody at nearly three hours long it drags endlessly with nothing meaningful to say i struggled to stay awake during the final act
['this', 'drama', 'takes', 'itself', 'way', 'too', 'seriously', 'and', 'becomes', 'unintentionally', 'funny', 'the', 'performances', 'are', 'overthetop', 'and', 'melodramatic', 'to', 'the', 'point', 'of', 'parody', 'at', 'nearly', 'three', 'hours', 'long', 'it', 'drags', 'endlessly', 'with', 'nothing', 'meaningful', 'to', 'say', 'i', 'struggled', 'to', 'stay', 'awake', 'during', 'the', 'final', 'act']


In [35]:
review1 = "The Dark Knight completely reinvented the superhero genre with its dark, gritty realism. Heath Ledger's performance as the Joker is absolutely mesmerizing and unforgettable. The action sequences are expertly crafted and the story keeps you on the edge of your seat. This is easily one of the best comic book movies ever made."
review2 = "Parasite is a masterclass in storytelling and social commentary. The film seamlessly blends dark comedy with intense thriller elements. Every shot is beautifully composed and the performances are phenomenal. It absolutely deserved its Best Picture Oscar."
review3 = "Everything Everywhere All at Once is pure creative genius. The multiverse concept is handled with incredible imagination and heart. Michelle Yeoh delivers a career-best performance that's both funny and deeply moving. This movie is an absolute joy from start to finish."
review4 = "Inception is a mind-bending thriller that rewards multiple viewings. The layered plot is complex but never confusing thanks to sharp direction. The visual effects are stunning and the score is iconic. Christopher Nolan proves once again why he's a master filmmaker."
review5 = "Get Out brilliantly uses horror to explore racial tensions in America. Jordan Peele's directorial debut is smart, scary, and completely original. The social commentary is sharp without being preachy. Every twist is earned and the ending is perfect."
review6 = "The latest Marvel movie feels like a soulless cash grab with no real substance. The CGI looks rushed and unfinished in several key scenes. The plot is predictable and the jokes fall completely flat. I was checking my watch halfway through this bloated mess."
review7 = "This romantic comedy relies on every tired cliché in the book. The chemistry between the leads is nonexistent and painfully awkward. The dialogue sounds like it was written by someone who has never had a real conversation. Save your money and skip this forgettable disaster."
review8 = "The sequel completely ruins what made the original special. They've replaced clever writing with loud explosions and meaningless action. The characters have lost all depth and just spout one-liners constantly. This franchise officially jumped the shark."
review9 = "The horror movie fails to deliver any genuine scares or tension. Jump scares are overused and predictable throughout the entire runtime. The plot makes no sense and leaves countless questions unanswered. The ending is especially disappointing and feels completely rushed."
review10 = "This drama takes itself way too seriously and becomes unintentionally funny. The performances are over-the-top and melodramatic to the point of parody. At nearly three hours long, it drags endlessly with nothing meaningful to say. I struggled to stay awake during the final act."

## labels for checking
labels = ["pos", "pos", "pos", "pos", "pos", "neg", "neg", "neg", "neg", "neg"]

## reviews in a list
reviews = [review1, review2, review3, review4, review5, review6, review7, review8, review9, review10]

In [None]:
pos_markers = ["amazing", "great", "brilliant", "beautiful", "best", "stunning", "masterpiece", "perfect", "excellent", "incredible", "fantastic", "wonderful", "outstanding", "superb", "phenomenal", "genius", "mesmerizing", "unforgettable", "masterclass", "joy"]
neg_markers = ["bad", "cliché", "boring", "disappointing", "terrible", "awful", "worst", "horrible", "painful", "waste", "mess", "disaster", "fails", "rushed", "predictable", "flat", "awkward", "tiresome", "bloated", "struggled", "nothing", "long", "pain", "awkward", "less"]

In [38]:
count_pos = 0
count_neg = 0

for review in reviews:
    for pos_word in pos_markers:
        count_pos += review.count(pos_word)
        
    for neg_word in neg_markers:
        count_neg += review.count(neg_word)
        
    if count_pos > count_neg:
        print("positive review")
    elif count_neg > count_pos:
        print("negative review")
    else:
        print("unclear")  

positive review
positive review
positive review
positive review
positive review
positive review
positive review
positive review
unclear
negative review


## Supervised Learning

### Naive Bayes

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
imdb_dataframe = pd.read_csv("imdb_dataset.csv")
imdb_dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
imdb_dataframe.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [8]:
X = imdb_dataframe['review']
y = imdb_dataframe['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 40000
Testing set size: 10000


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train) 
X_test_vectorized = vectorizer.transform(X_test)

## what vectoriser does:
## cleans text and covnerts to lowercase    
## tokenizes text into words
## creates a vocabulary of unique words

In [11]:
from sklearn.naive_bayes import MultinomialNB

In [12]:
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
for review in reviews:
    review_vectorized = vectorizer.transform([review])
    prediction = model.predict(review_vectorized)
    print(prediction[0])

NameError: name 'reviews' is not defined

In [None]:
predictions = model.predict(X_test_vectorized)
correct = 0

for i in range(len(predictions)):
    if predictions[i] == y_test.iloc[i]:
        correct += 1
        
print("Accuracy:", correct / len(predictions))

Accuracy: 0.8488


### Logistic Regression

Assumes a linear relationship between the independent variable and the outcome

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [None]:
predictions = model.predict(X_test_vectorized)
correct = 0

for i in range(len(predictions)):
    if predictions[i] == y_test.iloc[i]:
        correct += 1
        
print("Accuracy:", correct / len(predictions))

Accuracy: 0.8879


In [None]:
for review in reviews:
    review_vectorized = vectorizer.transform([review])
    prediction = model.predict(review_vectorized)
    print(prediction[0])

positive
positive
positive
positive
positive
negative
negative
negative
negative
negative


## Unsupervised Learning

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

In [15]:
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(X_train_vectorized)

In [16]:
labels = np.where(y_train == 'positive', 1, 0)

In [None]:
acc = accuracy_score(labels, cluster_labels)
print("Clustering accuracy:", acc)

Clustering accuracy: 0.510975
Confusion Matrix:
 [[17079  2960]
 [16601  3360]]


In [89]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_train_vectorized)

In [90]:
import plotly.express as px

In [99]:
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_pca['Cluster'] = cluster_labels.astype(str)
df_pca['True_Label'] = labels.astype(str)

df_pca.head()

Unnamed: 0,PC1,PC2,Cluster,True_Label
0,14.716342,-0.601526,1,0
1,7.506085,-0.848957,0,0
2,-11.449313,-2.393843,0,1
3,-7.392421,-2.830449,0,0
4,7.382555,7.538043,0,0


In [103]:
fig = px.scatter(
    df_pca,
    x='PC1',
    y='PC2',
    color='Cluster', 
    symbol='True_Label',
    hover_data=['Cluster', 'True_Label'],
    title='K-Means Clusters Visualized',
    labels={'PC1': 'PC1', 'PC2': 'PC2'},
    opacity=0.4
)

fig.show()

In [102]:
fig = px.scatter(
    df_pca,
    x='PC1',
    y='PC2',
    color='True_Label', 
    symbol='Cluster',
    hover_data=['Cluster', 'True_Label'],
    title='K-Means Clusters Visualized',
    labels={'PC1': 'PC1', 'PC2': 'PC2'},
    opacity=0.4
)

fig.show()