In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [3]:
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
from gensim.models import Word2Vec, FastText
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Part 1: Word2Vec Exploration with Pretrained Model
def part1_word2vec_exploration():
    # Load pretrained Word2Vec model
    print("Loading pretrained Word2Vec model...")
    w2v_model = api.load('word2vec-google-news-300')
    
    # Task 1: Find similar words for 5 chosen words
    words = ['computer', 'love', 'car', 'school', 'music']
    print("\nSimilar words for selected words:")
    for word in words:
        try:
            similar_words = w2v_model.most_similar(word, topn=5)
            print(f"\nWord: {word}")
            print("Similar words:", [(w, round(score, 4)) for w, score in similar_words])
        except KeyError:
            print(f"\nWord: {word} not in vocabulary")
    
    # Task 2: Test vector arithmetic (similar to king - man + woman ~= queen)
    analogies = [
        ('king', 'man', 'woman', 'queen'),
        ('paris', 'france', 'italy', 'rome'),
        ('big', 'bigger', 'small', 'smaller')
    ]
    print("\nTesting vector arithmetic analogies:")
    for w1, w2, w3, expected in analogies:
        try:
            result = w2v_model.most_similar(positive=[w1, w3], negative=[w2], topn=1)
            print(f"{w1} - {w2} + {w3} ~= {result[0][0]} (expected: {expected})")
        except KeyError as e:
            print(f"Word not in vocabulary: {e}")

# Part 2: Movie Review Sentiment Classifier
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

def get_document_vector(tokens, model, model_type='w2v'):
    vectors = []
    for token in tokens:
        try:
            if model_type == 'w2v' or model_type == 'fasttext':
                vectors.append(model[token])
            else:  # For gensim Word2Vec/FastText
                vectors.append(model.wv[token])
        except KeyError:
            continue
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(300)  # Return zero vector if no valid tokens

def train_and_evaluate(X, y, model_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    }

def part2_sentiment_classifier():
    # Load IMDB dataset
    print("\nLoading IMDB dataset...")
    df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
    
    # Task 1: Text EDA
    df['review_length'] = df['review'].apply(lambda x: len(word_tokenize(x)))
    print("\nEDA - Review Length Statistics:")
    print(df['review_length'].describe())
    
    plt.figure(figsize=(10, 6))
    sns.histplot(df['review_length'], bins=50)
    plt.title('Distribution of Review Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.savefig('review_length_distribution.png')
    plt.close()
    
    # Task 2: Clean text
    print("\nCleaning text data...")
    df['tokens'] = df['review'].apply(clean_text)
    
    # Task 3: Train models with different embeddings
    results = []
    
    # 3.1 Pretrained Word2Vec
    print("Loading pretrained Word2Vec model for sentiment analysis...")
    w2v_model = api.load('word2vec-google-news-300')
    X_w2v = np.array([get_document_vector(tokens, w2v_model, 'w2v') for tokens in df['tokens']])
    results.append(train_and_evaluate(X_w2v, df['sentiment'], 'Pretrained Word2Vec'))
    
    # 3.2 Custom Skip-gram
    print("Training custom Skip-gram model...")
    skipgram_model = Word2Vec(sentences=df['tokens'], vector_size=300, window=5, min_count=5, sg=1, workers=4)
    X_skipgram = np.array([get_document_vector(tokens, skipgram_model, 'custom') for tokens in df['tokens']])
    results.append(train_and_evaluate(X_skipgram, df['sentiment'], 'Custom Skip-gram'))
    
    # 3.3 Custom CBOW
    print("Training custom CBOW model...")
    cbow_model = Word2Vec(sentences=df['tokens'], vector_size=300, window=5, min_count=5, sg=0, workers=4)
    X_cbow = np.array([get_document_vector(tokens, cbow_model, 'custom') for tokens in df['tokens']])
    results.append(train_and_evaluate(X_cbow, df['sentiment'], 'Custom CBOW'))
    
    # 3.4 Custom FastText
    print("Training custom FastText model...")
    fasttext_model = FastText(sentences=df['tokens'], vector_size=300, window=5, min_count=5, workers=4)
    X_fasttext = np.array([get_document_vector(tokens, fasttext_model, 'custom') for tokens in df['tokens']])
    results.append(train_and_evaluate(X_fasttext, df['sentiment'], 'Custom FastText'))
    
    # Task 4: Tabulate results
    results_df = pd.DataFrame(results)
    print("\nModel Performance Statistics:")
    print(results_df)
    results_df.to_csv('model_performance.csv', index=False)

if __name__ == "__main__":
    part1_word2vec_exploration()
    part2_sentiment_classifier()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading pretrained Word2Vec model...

Similar words for selected words:

Word: computer
Similar words: [('computers', 0.7979), ('laptop', 0.664), ('laptop_computer', 0.6549), ('Computer', 0.6473), ('com_puter', 0.6082)]

Word: love
Similar words: [('loved', 0.6908), ('adore', 0.6817), ('loves', 0.6619), ('passion', 0.6101), ('hate', 0.6004)]

Word: car
Similar words: [('vehicle', 0.7821), ('cars', 0.7424), ('SUV', 0.7161), ('minivan', 0.6907), ('truck', 0.6736)]

Word: school
Similar words: [('elementary', 0.7869), ('schools', 0.7412), ('shool', 0.6692), ('elementary_schools', 0.6597), ('kindergarten', 0.653)]

Word: music
Similar words: [('classical_music', 0.7198), ('jazz', 0.6835), ('Music', 0.6596), ('Without_Donny_Kirshner', 0.6416), ('songs', 0.6396)]

Testing vector arithmetic analogies:
king - man + woman ~= queen (expected: queen)
paris - france + italy ~= lohan (expected: rome)
big - bigger + small ~= large (expected: smaller)

Loading IMDB dataset...

EDA - Review Length Sta

  with pd.option_context('mode.use_inf_as_na', True):



Cleaning text data...
Loading pretrained Word2Vec model for sentiment analysis...
Training custom Skip-gram model...
Training custom CBOW model...
Training custom FastText model...

Model Performance Statistics:
                 Model  Accuracy  Precision    Recall  F1-Score
0  Pretrained Word2Vec    0.8529   0.853828  0.854336  0.854082
1     Custom Skip-gram    0.8850   0.883002  0.889661  0.886319
2          Custom CBOW    0.8714   0.867149  0.879540  0.873300
3      Custom FastText    0.8575   0.854036  0.865053  0.859509
