In [4]:
import pandas as pd

df = pd.read_csv("twitter_training.csv")

def print_dataframe_stats(df):
    print("--- DataFrame Statistics ---")
    print(f"Total Rows: {df.shape[0]}")
    print(f"Total Columns: {df.shape[1]}")
    
    # duplicates
    duplicates_count = df.duplicated().sum()
    print(f"Duplicate Rows: {duplicates_count} ({duplicates_count / len(df):.2%})")
    
    # missing values
    total_missing = df.isna().sum().sum()
    print(f"Total Missing Values: {total_missing}")
    
    # empty rows (all values are NaN)
    empty_rows = df.isna().all(axis=1).sum()
    print(f"Completely Empty Rows: {empty_rows}")
    
    # memory usage
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB")
    print("----------------------------")


print_dataframe_stats(df)

--- DataFrame Statistics ---
Total Rows: 74681
Total Columns: 4
Duplicate Rows: 2700 (3.62%)
Total Missing Values: 686
Completely Empty Rows: 0
Memory Usage: 23.85 MB
----------------------------


In [5]:
def load_and_setup_data(df) -> pd.DataFrame:    
    # First column appers to be some sort file or sequence number and the second appears to be the source
    # Those two columns do not have any impact on sentiment analysis
    # Take the last two columns
    df = df.iloc[:, -2:]
    # Swap columns 1 and 2
    df = df[[df.columns[1], df.columns[0]]]
    # Setup column names
    df.columns = ["tweet", "sentiment"]
    # Remove empty rows
    df = df.dropna()
    # Remove where sentiment is "Irrelevant"
    df = df[df["sentiment"] != "Irrelevant"]
    # Remove diuplicate rows
    df = df.drop_duplicates()
    # Look for tweets where the same tweet is classified as a different sentiment
    # Take the first occurence - this will get us clean data and will not mislead the classifier later during training
    df = df.drop_duplicates(subset=["tweet"], keep="first")
    return df

df = load_and_setup_data(df)

print_dataframe_stats(df)

--- DataFrame Statistics ---
Total Rows: 57296
Total Columns: 2
Duplicate Rows: 0 (0.00%)
Total Missing Values: 0
Completely Empty Rows: 0
Memory Usage: 16.18 MB
----------------------------
