In [None]:
pip install pandas nltk emoji scikit-learn matplotlib seaborn

In [None]:
# Optional one-time download of the dataset archive; uncomment both lines to run.
# !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
# !unzip trainingandtestdata.zip

In [None]:
import html
import os
import re

import pandas as pd

# Location of the training CSV. Set the TWEET_DATA_PATH environment variable to
# point at your own copy; when it is unset, the original local path is used.
file_path = os.environ.get(
    "TWEET_DATA_PATH",
    "/Users/chaitanyam/Downloads/trainingandtestdata/training.1600000.processed.noemoticon.csv",
)

# The file is read without a header row, so column names are assigned afterwards.
df = pd.read_csv(file_path, encoding="ISO-8859-1", header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
# Only targets 0 and 4 are mapped; any other target value yields NaN in 'label'.
df['label'] = df['target'].map({0: 'negative', 4: 'positive'})

def clean_text(text):
    """Normalize a raw tweet into lowercase ASCII text.

    Steps, in order: remove URLs, strip HTML tags, decode HTML entities,
    remove @mentions and #hashtags, remove non-ASCII characters, remove every
    character other than letters, digits, whitespace and ``.,!?;:``, collapse
    whitespace runs, then lowercase and trim.

    Args:
        text: Raw tweet text. Missing values (``None`` / ``NaN``) are accepted;
            any other value is converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` for missing input or when nothing
        survives cleaning.
    """
    if pd.isna(text):
        return ""

    text = str(text)
    # URLs. The "www." branch is anchored on a word boundary and requires the
    # dot so elongated words such as "awwww" are not treated as links; matching
    # is case-insensitive so "HTTP://..." and "WWW...." are removed as well.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    # Decode entities (&amp;, &quot;, &#39;, ...) so they do not survive as junk
    # tokens like "amp;". This runs after tag stripping so a decoded "<" or ">"
    # cannot be mistaken for a tag, and before hashtag removal so numeric
    # entities such as "&#39;" are not parsed as hashtags.
    text = html.unescape(text)
    text = re.sub(r"@\w+|#\w+", "", text)  # mentions and hashtags
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # non-ASCII characters (emojis etc.)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?;:]", "", text)  # keep a small punctuation set
    text = re.sub(r'\s+', ' ', text)
    text = text.lower().strip()

    return text

# Derive the cleaned column, then keep a single row per distinct cleaned text
# and discard rows whose cleaned text is empty or missing.
df['clean_text'] = df['text'].apply(clean_text)
df = (
    df.drop_duplicates(subset=['clean_text'])
      .loc[lambda frame: frame['clean_text'].str.strip() != '']
      .dropna(subset=['clean_text'])
)

print(f"\nData cleaned: {len(df):,} samples\n")

# Show a fixed-seed sample of raw vs. cleaned tweets as a sanity check.
print("Some of the Cleaned Tweets:\n")
cleaned_preview = df[['text', 'clean_text', 'label']].sample(4, random_state=42)
print(cleaned_preview.to_string(index=False))

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by this notebook.
# - 'punkt' / 'punkt_tab': tokenizer models for word_tokenize and sent_tokenize.
#   Recent NLTK releases load them from 'punkt_tab', older ones from 'punkt',
#   so both are requested to work regardless of the installed version.
# - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer ('omw-1.4' is required
#   by some NLTK releases when WordNet is first loaded).
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)


# Word-level tokens and their count per tweet.
df['word_tokens'] = df['clean_text'].apply(word_tokenize)
df['word_count'] = df['word_tokens'].apply(len)


# Sentence-level tokens and their count per tweet.
df['sentence_tokens'] = df['clean_text'].apply(sent_tokenize)
df['sentence_count'] = df['sentence_tokens'].apply(len)


df[['clean_text', 'word_tokens', 'word_count', 'sentence_tokens', 'sentence_count']].head()

In [None]:
# Single shared lemmatizer instance reused for every tweet.
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces.

    Each token is passed to ``lemmatizer.lemmatize`` with no extra arguments.
    """
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(text))
    return " ".join(lemmas)

# Lemmatize every cleaned tweet into its own column.
df['lemmatized_text'] = df['clean_text'].map(lemmatize)

# Show a fixed-seed sample of cleaned vs. lemmatized tweets.
print("Sample Lemmatized Tweets:\n")
lemma_preview = df[['clean_text', 'lemmatized_text', 'label']].sample(4, random_state=42)
print(lemma_preview.to_string(index=False))

In [None]:
from collections import Counter

# Character length of each cleaned tweet.
df['text_length'] = df['clean_text'].str.len()

# Flat list of whitespace-separated words across all cleaned tweets, plus
# their frequencies; the vocabulary is the set of distinct words.
all_words = ' '.join(df['clean_text']).split()
word_freq = Counter(all_words)
vocab_size = len(word_freq)
total_words = len(all_words)

n_samples = len(df)

print("DATASET STATISTICS")
print(f"Total samples: {n_samples:,}")
print(f"Unique texts: {df['clean_text'].nunique():,}")

# Per-label counts with their share of the whole dataset.
label_counts = df['label'].value_counts()
for label, count in label_counts.items():
    percentage = count / n_samples * 100
    print(f"{label.capitalize()}: {count:,} ({percentage:.1f}%)")

avg_chars = df['text_length'].mean()
avg_words = df['word_count'].mean()
print(f"Average text length: {avg_chars:.1f} characters")
print(f"Average word count: {avg_words:.1f}")
print(f"Vocabulary size: {vocab_size:,} words")

print("Most common words:")
for rank, (word, freq) in enumerate(word_freq.most_common(5), start=1):
    print(f"{rank}. '{word}': {freq:,}")



In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): 'lemmatized_text' is computed earlier in the notebook, but the
# splits below are built from 'clean_text' — confirm this is intended.
X = df['clean_text']
y = df['label']

# Two-stage stratified split: hold out 20% first, then halve the hold-out
# into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Data splits created:")
for split_name, split_texts in (('Training', X_train), ('Validation', X_val), ('Test', X_test)):
    print(f"{split_name}: {len(split_texts):,} ({len(split_texts)/len(df)*100:.1f}%)")

# Class balance inside each split.
for split_name, split_labels in [('Training', y_train), ('Validation', y_val), ('Test', y_test)]:
    n_labels = len(split_labels)
    neg_pct = (split_labels == 'negative').sum() / n_labels * 100
    pos_pct = (split_labels == 'positive').sum() / n_labels * 100
    print(f"{split_name}: {neg_pct:.1f}% negative, {pos_pct:.1f}% positive")

# Two-column frames (text, label) built from the raw values, so the original
# row index is not carried over.
train_df = pd.DataFrame({'text': X_train.values, 'label': y_train.values})
val_df = pd.DataFrame({'text': X_val.values, 'label': y_val.values})
test_df = pd.DataFrame({'text': X_test.values, 'label': y_test.values})

# Write each split to a CSV in the current working directory.
for out_name, out_frame in (("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df)):
    out_frame.to_csv(out_name, index=False)

print("Files saved:")
print(f"train.csv: {len(train_df):,} samples")
print(f"val.csv: {len(val_df):,} samples")
print(f"test.csv: {len(test_df):,} samples")
print("Processing completed")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute and relative frequency of each sentiment label.
label_counts = df['label'].value_counts()
label_percent = df['label'].value_counts(normalize=True) * 100

print("Sentiment Label Distribution:\n")
for label in label_counts.index:
    n_tweets = label_counts[label]
    share = label_percent[label]
    print(f"{label.capitalize()}: {n_tweets:,} tweets ({share:.2f}%)")

# Mean cleaned-text length per label, in order of first appearance.
print("Average Tweet Length (in characters) per Sentiment:\n")
for label in df['label'].unique():
    label_mask = df['label'] == label
    avg_length = df.loc[label_mask, 'clean_text'].str.len().mean()
    print(f"{label.capitalize()}: {avg_length:.1f} characters")

# Bar chart of label counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='label', data=df, hue='label', palette='Set2', dodge=False, legend=False, ax=ax)
ax.set_title("Label Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()