In [6]:
import pandas as pd


df = pd.read_csv(r"C:\Users\NILESH\Desktop\IMDB Dataset.csv")


print(df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [7]:
import re

def clean_text(text):
    text = text.lower() 
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'[^\w\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  
    return text

df['cleaned_review'] = df['review'].apply(clean_text)
print(df[['review', 'cleaned_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production br br the filmin...  
2  i thought this was a wonderful way to spend ti...  
3  basically theres a family where a little boy j...  
4  petter matteis love in the time of money is a ...  


In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

df['tokens'] = df['cleaned_review'].apply(word_tokenize)
print(df[['cleaned_review', 'tokens']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NILESH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production br br the filmin...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                              tokens  
0  [one, of, the, other, reviewers, has, mentione...  
1  [a, wonderful, little, production, br, br, the...  
2  [i, thought, this, was, a, wonderful, way, to,...  
3  [basically, theres, a, family, where, a, littl...  
4  [petter, matteis, love, in, the, time, of, mon...  


In [9]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
print(df[['cleaned_review', 'tokens']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NILESH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production br br the filmin...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                              tokens  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, br, br, filmin...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, theres, family, little, boy, jake,...  
4  [petter, matteis, love, time, money, visually,...  


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x] if isinstance(x, list) else [])

print(df[['cleaned_review', 'tokens']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NILESH\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NILESH\AppData\Roaming\nltk_data...


                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production br br the filmin...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                              tokens  
0  [one, reviewer, mentioned, watching, oz, episo...  
1  [wonderful, little, production, br, br, filmin...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, there, family, little, boy, jake, ...  
4  [petter, matteis, love, time, money, visually,...  


In [12]:
df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
print(df[['review', 'processed_text']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      processed_text  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


In [13]:
df.to_csv('cleaned_imdb_reviews.csv', index=False)
print("Cleaned dataset saved successfully!")


Cleaned dataset saved successfully!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = ["This movie is fantastic", "I hated this film", "Amazing story and great acting"]

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))  
X = vectorizer.fit_transform(documents)


print(vectorizer.get_feature_names_out())


['acting' 'amazing' 'amazing story' 'and' 'and great' 'fantastic' 'film'
 'great' 'great acting' 'hated' 'hated this' 'is' 'is fantastic' 'movie'
 'movie is' 'story' 'story and' 'this' 'this film' 'this movie']


In [1]:
import gensim.downloader as api

word2vec_model = api.load("word2vec-google-news-300")

vector = word2vec_model["amazing"]
print(vector.shape) 


(300,)
