# **1**

# **Install and Import Required Libraries**

In [None]:
!pip install datasets scikit-learn




In [None]:
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer


# **Upload the CSV File**

In [None]:
from google.colab import files
# Ask Colab for a file upload; whatever files.upload() returns is kept in `uploaded`.
uploaded = files.upload()


Saving IMDB_Dataset[1].csv to IMDB_Dataset[1].csv


# **Load CSV Using Pandas**

In [None]:
import pandas as pd

# File name under which the uploaded reviews CSV was saved.
csv_path = "IMDB_Dataset[1].csv"
df = pd.read_csv(csv_path)

# Display the first rows as a quick check that the load worked.
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **Confirm Columns**

In [None]:
# Show the column labels of the loaded frame.
df.columns


Index(['review', 'sentiment'], dtype='object')

# **Extract Only the Review Text Column**

In [None]:
# Select the review-text column as a Series.
texts = df["review"]

# First five reviews, followed by a divider line.
print(texts.head(), "------------------------------", sep="\n")
# The first review in full.
print(texts.iloc[0], "-------------------------------", sep="\n")
# One specific review by position (index 100, i.e. the 101st row).
print(texts.iloc[100], "-------------------------------", sep="\n")
# Total number of reviews.
print(len(texts))


0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object
------------------------------
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all th

# **Preprocessing (Lowercase + Remove Punctuation + Tokenize)**

In [None]:
import re

def preprocess(text):
    """Normalise one raw review into a string of lowercase, space-separated tokens.

    Steps, in order: lowercase the text, replace HTML tags such as ``<br />``
    with a space, delete punctuation characters (so "you'll" becomes "youll"),
    split on whitespace, and re-join the tokens with single spaces.

    Args:
        text: Raw review string.

    Returns:
        The cleaned review as one string; an empty string for empty input.
    """
    text = text.lower()                     # convert to lowercase
    # Tags must become a space *before* punctuation is stripped; otherwise
    # "me.<br /><br />The" collapses into the bogus tokens "mebr br the".
    # Only spans that start like a tag name are matched, so "<3" is left alone.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)
    text = re.sub(r"[^\w\s]", "", text)     # remove punctuation
    tokens = text.split()                  # tokenization
    return " ".join(tokens)

# Run preprocess() over every review; the raw `texts` Series is not modified.
processed_texts = texts.apply(preprocess)

# First five cleaned reviews, followed by a divider line.
print(processed_texts.head(), "------------------------------", sep="\n")

# The first cleaned review in full.
first_processed = processed_texts.iloc[0]
print(first_processed, "-------------------------------", sep="\n")

# Before/after comparison for the first review.
print("Original Review:\n", texts.iloc[0])
print("\nPreprocessed Review:\n", first_processed)
print("-------------------------------")


0    one of the other reviewers has mentioned that ...
1    a wonderful little production br br the filmin...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where a little boy j...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object
------------------------------
one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with mebr br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordbr br it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts 

# **Apply CountVectorizer (Bag of Words)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer with default settings, fitted on the preprocessed reviews.
# X holds the resulting bag-of-words matrix; its shape is printed in the next cell.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_texts)


# **Print the Final Feature Matrix Dimensions**

In [None]:
# Report the dimensions of the bag-of-words matrix X.
print("Feature Matrix Shape:", X.shape)


Feature Matrix Shape: (50000, 181018)


# **2**

# **Apply TfidfVectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer with default settings, fitted on the same preprocessed reviews.
# X_tfidf holds the resulting TF-IDF matrix used by the following cells.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(processed_texts)


# **Check TF-IDF Matrix Dimensions**

In [None]:
# Report the dimensions of the TF-IDF matrix.
print("TF-IDF Matrix Shape:", X_tfidf.shape)


TF-IDF Matrix Shape: (50000, 181018)


# **Get All Feature (Word) Names**

In [None]:
# Feature (word) names from the fitted TF-IDF vectorizer; later indexed by
# column position when the top-scoring words are looked up.
feature_names = tfidf_vectorizer.get_feature_names_out()


# **Find Top 10 Words with Highest TF-IDF Scores**

In [None]:
import numpy as np

# Largest TF-IDF weight that each word reaches in any single review.
max_tfidf_scores = X_tfidf.max(axis=0).toarray().ravel()

# Sort ascending, reverse, and keep the first ten positions: the ten highest scores.
top_10_indices = max_tfidf_scores.argsort()[::-1][:10]

# Pair every selected word with its score.
top_10_words = list(
    zip(feature_names[top_10_indices], max_tfidf_scores[top_10_indices])
)

# One "word : score" line per entry.
for word, score in top_10_words:
    print(word, ":", score)


trivialboring : 0.9903091228374733
pokemon : 0.915922533317921
ghoulies : 0.9077695563114262
blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah : 0.8798445561624949
doodlebops : 0.873900428668498
esperanto : 0.8728332232131242
scanners : 0.8700643860864498
ernest : 0.8693541552626862
smallville : 0.8669931598214625
uzumakis : 0.8648082100303551


# **3**

# **Upload File to Colab**

In [None]:
from google.colab import files
# Ask Colab for another file upload; the return value is not stored here.
files.upload()


# **Confirm File Is Uploaded**

In [None]:
import os
# List the entries of the current working directory to confirm the upload is present.
os.listdir()


['.config', 'IMDB_Dataset[1].csv', 'glove.6B.100d[1].txt', 'sample_data']

# **Set the GloVe File Path**

In [None]:
# File name of the uploaded GloVe vectors (the loading cell below assigns the same value again).
glove_path = "glove.6B.100d[1].txt"


# **Load GloVe Embeddings**

In [None]:
import numpy as np

glove_path = "glove.6B.100d[1].txt"

# word -> float32 vector, filled from the file one line at a time.
embeddings = {}

with open(glove_path, "r", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        # The first whitespace-separated field is the word; the rest are its vector components.
        fields = raw_line.split()
        embeddings[fields[0]] = np.array(fields[1:], dtype="float32")

print("Total words loaded:", len(embeddings))


Total words loaded: 400000


# **Define a Function for Analogy Task**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_analogy(word1, word2, word3, embeddings, top_n=5):
    """Rank vocabulary words by cosine similarity to ``word1 - word2 + word3``.

    Args:
        word1: Word whose vector is added.
        word2: Word whose vector is subtracted.
        word3: Word whose vector is added.
        embeddings: Mapping from word to a 1-D numeric vector; all vectors
            must have the same length.
        top_n: Number of results to keep (applied as a slice).

    Returns:
        A list of ``(word, similarity)`` tuples ordered by decreasing
        similarity. The three query words are never part of the result,
        because a query word is usually its own nearest neighbour and would
        otherwise crowd out the real answer. An empty list is returned (after
        printing a message) when any query word is missing from ``embeddings``.
    """
    import numpy as np  # local import so the function does not rely on another cell

    query_words = (word1, word2, word3)

    # Check all words exist
    if any(w not in embeddings for w in query_words):
        print("One of the words is not in the vocabulary.")
        return []  # an empty list keeps `for word, score in result` working

    # Vector arithmetic
    result_vector = embeddings[word1] - embeddings[word2] + embeddings[word3]

    # Every vocabulary word except the query words is a candidate answer.
    candidates = [w for w in embeddings if w not in query_words]
    if not candidates:
        return []

    # One matrix product replaces a cosine_similarity call per vocabulary word.
    matrix = np.vstack([embeddings[w] for w in candidates])
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(result_vector)
    norms[norms == 0] = 1.0  # zero vectors get similarity 0 instead of NaN
    similarities = (matrix @ result_vector) / norms

    # Sort by similarity (descending); the stable sort keeps ties in vocabulary order.
    order = np.argsort(-similarities, kind="stable")[:top_n]

    return [(candidates[i], similarities[i]) for i in order]


# **Perform the Required Analogy**

In [None]:
# Query: teacher - educate + heal, scored against the loaded GloVe vectors.
result = find_analogy("teacher", "educate", "heal", embeddings)

# find_analogy can return None when a query word is out of vocabulary;
# `or []` keeps this loop from raising TypeError in that case.
for word, score in result or []:
    print(word, ":", score)


teacher : 0.5776843
burns : 0.56647384
broken : 0.5609497
surgery : 0.5605513
dies : 0.55698824


# **4**

# **Install & Import Libraries**

In [None]:
!pip install nltk gensim


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec


# **Download Brown Corpus**

In [None]:
# Download the Brown corpus data read through nltk.corpus.brown.
nltk.download("brown")
# NOTE(review): no visible cell uses a Punkt tokenizer (sentences come from
# brown.sents()) — confirm this download is still needed.
nltk.download("punkt")


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# **Load Brown Corpus Text Data**

In [None]:
# Sentences of the Brown corpus as returned by brown.sents(); the first one
# printed further down is a list of word tokens.
sentences = brown.sents()
print("Total Sentences in Brown Corpus:", len(sentences))


Total Sentences in Brown Corpus: 57340


# **Preprocessing (Tokenization + Lowercase)**

In [None]:
# Lowercase every token while keeping the sentence -> tokens nesting.
processed_sentences = [list(map(str.lower, sent)) for sent in sentences]

# Peek at the first normalised sentence.
print(processed_sentences[0])


['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


# **Train CBOW Model**

In [None]:
# Training settings for the CBOW run, gathered in one place.
cbow_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 5,
    "workers": 4,
    "sg": 0,    # CBOW
}
cbow_model = Word2Vec(sentences=processed_sentences, **cbow_params)


# **Train Skip-gram Model**

In [None]:
# Training settings for the Skip-gram run; identical to the CBOW cell except for `sg`.
skipgram_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 5,
    "workers": 4,
    "sg": 1,    # Skip-gram
}
skipgram_model = Word2Vec(sentences=processed_sentences, **skipgram_params)


# **Most Similar Words to "king"**

In [None]:
# Five nearest neighbours of "king" under each model, CBOW first.
print("CBOW Similar Words to 'king':")
cbow_king_neighbours = cbow_model.wv.most_similar("king", topn=5)
print(cbow_king_neighbours)

print("\nSkip-gram Similar Words to 'king':")
skipgram_king_neighbours = skipgram_model.wv.most_similar("king", topn=5)
print(skipgram_king_neighbours)


CBOW Similar Words to 'king':
[('frank', 0.9354960322380066), ('wendell', 0.9315876364707947), ('sister', 0.918956995010376), ('poet', 0.9177730679512024), ('judge', 0.9112087488174438)]

Skip-gram Similar Words to 'king':
[('eisenhower', 0.8978859186172485), ('pope', 0.8926117420196533), ('edward', 0.8923366069793701), ('vice', 0.8867267966270447), ('williams', 0.8843581676483154)]


# **Similarity Score Between Two Words**

In [None]:
# Similarity between "doctor" and "nurse" according to each model.
cbow_doctor_nurse = cbow_model.wv.similarity("doctor", "nurse")
print("CBOW Similarity (doctor, nurse):", cbow_doctor_nurse)

skipgram_doctor_nurse = skipgram_model.wv.similarity("doctor", "nurse")
print("Skip-gram Similarity (doctor, nurse):", skipgram_doctor_nurse)


CBOW Similarity (doctor, nurse): 0.72223556
Skip-gram Similarity (doctor, nurse): 0.74731755


# **Analogy Task from the Previous Question (GloVe), Using "medical" in Place of "heal"**

# **CBOW Analogy Test**

In [None]:
# Query: teacher - educate + medical on the CBOW vectors.
# NOTE(review): the GloVe analogy earlier in this notebook used "heal" as the
# third word, whereas this query uses "medical" — confirm the substitution is
# intended (presumably "heal" is missing from this model's vocabulary).
cbow_query = {
    "positive": ["teacher", "medical"],
    "negative": ["educate"],
    "topn": 5,
}
cbow_analogy = cbow_model.wv.most_similar(**cbow_query)

print("CBOW Analogy Result:")
for word, score in cbow_analogy:
    print(word, ":", score)


CBOW Analogy Result:
trade : 0.8662232160568237
junior : 0.8546879887580872
barley : 0.8505846261978149
corporation : 0.8477445244789124
product : 0.843788206577301


# **Skip-gram Analogy Test**

In [None]:
# Query: teacher - educate + medical on the Skip-gram vectors.
# NOTE(review): the GloVe analogy earlier in this notebook used "heal" as the
# third word, whereas this query uses "medical" — confirm the substitution is
# intended (presumably "heal" is missing from this model's vocabulary).
skipgram_query = {
    "positive": ["teacher", "medical"],
    "negative": ["educate"],
    "topn": 5,
}
skipgram_analogy = skipgram_model.wv.most_similar(**skipgram_query)

print("\nSkip-gram Analogy Result:")
for word, score in skipgram_analogy:
    print(word, ":", score)




Skip-gram Analogy Result:
college : 0.7611891031265259
department : 0.7467976808547974
education : 0.7391571998596191
student : 0.7147482633590698
health : 0.7076064348220825


# **Classic Analogy Test (Extra Proof)**

In [None]:
# The classic king - man + woman query, run against both models.
print("CBOW: King - Man + Woman")
cbow_king_analogy = cbow_model.wv.most_similar(
    positive=["king", "woman"], negative=["man"], topn=3
)
print(cbow_king_analogy)

print("\nSkip-gram: King - Man + Woman")
skipgram_king_analogy = skipgram_model.wv.most_similar(
    positive=["king", "woman"], negative=["man"], topn=3
)
print(skipgram_king_analogy)


CBOW: King - Man + Woman
[('sister', 0.9228566884994507), ('quaker', 0.9150481820106506), ('wendell', 0.9150415658950806)]

Skip-gram: King - Man + Woman
[('daughter', 0.8299257159233093), ('queen', 0.8027129769325256), ('harris', 0.7932974100112915)]
