"""
twitter_cleaning_project.py
Clean and profile the Twitter_Data.csv sentiment dataset
========================================================
Author : <Parshvi Akkewar>
Date   : 2025-06-01

Key steps
---------
1. Load & validate raw data (data-integrity)
2. Handle missing values
3. Remove duplicate rows
4. Standardise text & label formats
5. Detect & treat outliers (tweet length)
6. Quick EDA & save cleaned data
"""


# In [8]:
# Import Libraries

import pandas as pd

# In [10]:
# Read the raw sentiment CSV from the working directory into a DataFrame
raw_data_path = "Twitter_Data.csv"
df = pd.read_csv(raw_data_path)

# In [12]:
# 1. Data Integrity: Check for required columns and datatypes
# Fail early with a clear message when an expected column is absent; the
# cleaning steps below index df['clean_text'] and df['category'] directly.
required_columns = ['clean_text', 'category']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Input data is missing required columns: {missing_columns}")

print("Initial Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the report.
df.info()


# Initial Info:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 162980 entries, 0 to 162979
# Data columns (total 2 columns):
#  #   Column      Non-Null Count   Dtype
# ---  ------      --------------   -----
#  0   clean_text  162976 non-null  object
#  1   category    162973 non-null  float64
# dtypes: float64(1), object(1)
# memory usage: 2.5+ MB
# None


# In [14]:
# 2. Missing Data Handling: discard every row that has a NaN in any column
df = df.dropna(axis=0, how='any')


# In [18]:
# 3. Duplicate Removal: keep only the first occurrence of each identical row
df = df.drop_duplicates(keep='first')

# In [20]:
# 4. Standardization: Strip whitespaces from text and ensure category is numeric
df['clean_text'] = df['clean_text'].str.strip()
# errors='coerce' turns any non-numeric label into NaN instead of raising.
df['category'] = pd.to_numeric(df['category'], errors='coerce')

# Stripping happens after the missing-value and duplicate passes, so it can
# expose two new kinds of bad rows: tweets that were whitespace-only (now
# empty strings) and rows that differed only by surrounding whitespace (now
# exact duplicates). Remove both so the earlier guarantees still hold.
df = df[df['clean_text'] != '']
df = df.drop_duplicates()


# In [22]:
# Retain only rows whose sentiment label is one of -1, 0 or 1
valid_categories = [-1, 0, 1]
has_valid_category = df['category'].isin(valid_categories)
df = df[has_valid_category]


# In [24]:
# 5. Outlier Detection: Identify outliers in tweet length
# Character count per tweet, computed with the vectorised string accessor
# rather than a Python-level apply(len) call per row.
df['text_length'] = df['clean_text'].str.len()

# First and third quartiles of the length distribution in a single pass.
Q1, Q3 = df['text_length'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# 1.5 * IQR rule: lengths outside [lower, upper] are treated as outliers.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default, i.e. lower <= length <= upper.
df = df[df['text_length'].between(lower, upper)]


# In [26]:
# Discard the temporary tweet-length helper column
df = df.drop('text_length', axis='columns')

# In [28]:
# Write the cleaned DataFrame to disk without the index column, then confirm
cleaned_data_path = "Twitter_Data_Cleaned.csv"
df.to_csv(cleaned_data_path, index=False)

print("\n✅ Data cleaning complete. Cleaned data saved as 'Twitter_Data_Cleaned.csv'")



# ✅ Data cleaning complete. Cleaned data saved as 'Twitter_Data_Cleaned.csv'
