In [None]:
import html
import re
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# Fetch the tokenizer models and the stopword list before they are used.
# 'punkt_tab' is needed by newer NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import io

from google.colab import files

# Name the dataset is expected to have, both in the upload dialog and on disk.
CSV_NAME = "twitter4000.csv"

# files.upload() returns a dict mapping each uploaded file name to its bytes.
uploaded = files.upload()

if uploaded:
    # Read the bytes that were just uploaded instead of re-opening a path:
    # when a file with the same name already exists, Colab stores the new
    # upload under a renamed path (e.g. "twitter4000 (3).csv"), so reading
    # "twitter4000.csv" from disk would silently load an older copy.
    chosen_name = CSV_NAME if CSV_NAME in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[chosen_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy on disk.
    df = pd.read_csv(CSV_NAME)

print("Dataset Shape:", df.shape)
print(df.head())

Saving twitter4000.csv to twitter4000 (3).csv
Dataset Shape: (4000, 2)
                                              twitts  sentiment
0  is bored and wants to watch a movie  any sugge...          0
1           back in miami.  waiting to unboard ship           0
2  @misskpey awwww dnt dis brng bak memoriessss, ...          0
3                  ughhh i am so tired  blahhhhhhhhh          0
4  @mandagoforth me bad! It's funny though. Zacha...          0


In [None]:

# Replace missing tweets with empty strings so later text cleaning never
# turns a NaN into the literal word "nan".
raw_twitts = df['twitts']
df['twitts'] = raw_twitts.fillna(value="")

In [None]:
stop_words = set(stopwords.words('english'))

# Built once: the table that deletes ASCII punctuation is the same for every tweet.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one tweet into a space-joined string of content tokens.

    Steps, in order: decode HTML entities, lowercase, remove URLs,
    @mentions, '#' signs, digits and ASCII punctuation, collapse whitespace,
    tokenize with NLTK's ``word_tokenize`` and drop English stopwords.

    Args:
        text: The raw tweet. Any object is accepted; it is converted with
            ``str()`` first.

    Returns:
        str: The surviving tokens joined by single spaces ("" if none survive).
    """
    # Decode entities such as "&amp;" before anything else; otherwise the
    # punctuation step strips "&" and ";" and leaves the junk token "amp".
    text = html.unescape(str(text))

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove # symbol only (keep word)
    text = re.sub(r'#', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # ======================
    # Tokenization
    # ======================
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    return " ".join(tokens)


In [None]:
# Clean every tweet and keep the result next to the raw text for comparison.
cleaned_twitts = df['twitts'].apply(preprocess_text)
df['processed_twitts'] = cleaned_twitts

print("\nProcessed Sample:")
comparison_columns = ['twitts', 'processed_twitts']
print(df[comparison_columns].head())



Processed Sample:
                                              twitts  \
0  is bored and wants to watch a movie  any sugge...   
1           back in miami.  waiting to unboard ship    
2  @misskpey awwww dnt dis brng bak memoriessss, ...   
3                  ughhh i am so tired  blahhhhhhhhh   
4  @mandagoforth me bad! It's funny though. Zacha...   

                                    processed_twitts  
0                bored wants watch movie suggestions  
1                    back miami waiting unboard ship  
2       dnt dis brng bak memoriessss thnk im sad lol  
3                           ughhh tired blahhhhhhhhh  
4  bad funny though zachary quinto though amp rep...  


In [None]:
# Unigram + bigram TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer_options = {"max_features": 5000, "ngram_range": (1, 2)}
tfidf = TfidfVectorizer(**vectorizer_options)

# NOTE(review): the vocabulary and IDF weights are learned from every row of
# df. If X is later split into train/test sets, the held-out rows have already
# influenced the features; consider fitting on the training rows only.
corpus = df['processed_twitts']
X = tfidf.fit_transform(corpus)
y = df['sentiment']

print("\nTF-IDF Shape:", X.shape)



TF-IDF Shape: (4000, 5000)


In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTrain Shape:", X_train.shape)
print("Test Shape:", X_test.shape)



Train Shape: (3200, 5000)
Test Shape: (800, 5000)


In [None]:

# Walk one tweet through the pipeline: raw text, cleaned text, then tokens.
sample_text = df['twitts'].iloc[0]
processed_sample = preprocess_text(sample_text)

print("\nOriginal Text:")
print(sample_text)

print("\nProcessed Text:")
print(processed_sample)

print("\nTokens:")
print(word_tokenize(processed_sample))



Original Text:
is bored and wants to watch a movie  any suggestions?

Processed Text:
bored wants watch movie suggestions

Tokens:
['bored', 'wants', 'watch', 'movie', 'suggestions']


In [None]:

# Terms (unigrams and bigrams) in the fitted vectorizer's vocabulary.
feature_names = tfidf.get_feature_names_out()
first_features = feature_names[:20]

print("\nSample TF-IDF Features:")
print(first_features)


Sample TF-IDF Features:
['aaaaaah' 'aaaall' 'aaaall day' 'aaahhh' 'aaahhh iris' 'aahhh'
 'aahhh ahahha' 'aargh' 'aargh tweets' 'aaron' 'aaron kinda'
 'aaron smashed' 'aarrrg' 'aarrrg fond' 'ab' 'able' 'able go' 'absolutely'
 'abt' 'ac']
