In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from gensim.models import Word2Vec

In [2]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the dataset from a local file
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

print(df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\theat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\theat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [3]:
# Function to clean the text data
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuations and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# Preprocess the text column
tqdm.pandas(desc="Preprocessing")
df['clean_text'] = df['text'].progress_apply(preprocess_text)

Preprocessing: 100%|██████████| 1600000/1600000 [1:02:09<00:00, 428.98it/s]


In [5]:
df.to_csv('preprocessed_sentiment140.csv', index=False)

In [24]:
sentences = df['clean_text'].tolist()
# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=300, window=7, min_count=10, workers=4)

In [25]:
model.build_vocab(sentences, update=True)

In [26]:
model.train(sentences, total_examples=model.corpus_count, epochs=5)

(50235230, 57400830)

In [29]:
model.train(sentences, total_examples=model.corpus_count, epochs=10)

(100469513, 114801660)

In [44]:
model.save('word2vec.model')