# Data Exploration
This notebook loads and explores the dataset.

## Imports

In [1]:
# Install TextBlob quietly, then download the corpora it ships a downloader for.
# Both lines are shell commands executed through IPython's `!` escape.
!pip install -q textblob
!python -m textblob.download_corpora
# Download NLTK's stopword corpus; it is read later through nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[0m[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from random import randint
import re
from nltk.corpus import stopwords
from textblob.classifiers import NaiveBayesClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Horizontal rule (100 dashes) printed around sample texts to make them easy to spot.
separator = '-' * 100

## Cleanup csv file
Some rows contain several text fragments separated by tabs, which are treated as separate columns when the file is loaded into a dataframe.  
We run a cleaning pass over the CSV file and merge those fragments so that each row holds exactly one text field:  
**[label TAB text]**.

In [3]:
import csv

def clean_tabs_and_filter_csv(input_file, output_file):
    """Rewrite a tab-separated file so that every row is exactly ``label<TAB>text``.

    The first field of each input row is taken as the label. All remaining
    fields are fragments of a text that contained stray tabs; they are merged
    back into a single text field, joined by single spaces. Rows with fewer
    than two fields (no label/text pair) are dropped.

    Parameters
    ----------
    input_file : str or path-like
        Path of the UTF-8, tab-separated source file.
    output_file : str or path-like
        Path of the cleaned UTF-8, tab-separated file to write (overwritten).

    Returns
    -------
    None
    """
    # newline='' on both handles lets the csv module do its own newline handling.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            # Filter out rows that do not carry both a label and some text.
            if len(row) < 2:
                continue

            # The first tab separates the label from the text; keep that split.
            label, *text_parts = row

            # csv.reader has already split the text on every tab, so the text
            # is rebuilt by joining the fragments. Empty fragments (from
            # consecutive or trailing tabs) are skipped to avoid stray spaces.
            merged_text = ' '.join(part for part in text_parts if part)

            writer.writerow([label, merged_text])

# Usage: raw file in, cleaned [label TAB text] file out.
input_file, output_file = 'training_data_lowercase.csv', 'training_data_clean.csv'
clean_tabs_and_filter_csv(input_file, output_file)

# Confirm where the cleaned copy was written.
print("File cleaned and filtered. Saved as", output_file)


File cleaned and filtered. Saved as training_data_clean.csv


## Load dataset

In [7]:
import pandas as pd

# Load the cleaned file produced by the cleanup step, not the raw one: raw rows
# with stray tabs have more than two fields and would not fit the
# two-column [label, text] layout declared below.
file_path = "training_data_clean.csv"
data = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'text'])

# Put the feature column first, then the target.
data = data[['text', 'label']]
print(data.shape)
data.head(10)


(34152, 2)


Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year‚Ä...,0
1,Drunk Bragging Trump Staffer Started Russian C...,0
2,Sheriff David Clarke Becomes An Internet Joke ...,0
3,Trump Is So Obsessed He Even Has Obama‚Äôs Nam...,0
4,Pope Francis Just Called Out Donald Trump Duri...,0
5,Racist Alabama Cops Brutalize Black Boy While ...,0
6,Fresh Off The Golf Course,0
7,Trump Said Some INSANELY Racist Stuff Inside T...,0
8,Former CIA Director Slams Trump Over UN Bullying,0
9,WATCH: Brand-New Pro-Trump Ad Features So Much...,0


In [4]:
# Feature column (raw text) and target column (label) as separate Series.
X, y = data["text"], data["label"]

def print_text(feature, label, idx=None):
    """Print one sample as ``[idx] text --> label`` between two separator lines.

    Parameters
    ----------
    feature : sequence or pandas.Series
        Texts, looked up with ``feature[idx]``.
    label : sequence or pandas.Series
        Labels aligned with ``feature``, looked up with ``label[idx]``.
    idx : int, optional
        Index of the sample to show. When omitted, a random valid position
        is drawn.

    Returns
    -------
    None
        The sample, or a short notice when it cannot be looked up, is
        written to stdout.
    """
    n_samples = len(feature)
    if n_samples == 0:
        print("Can't print text: no samples available.")
        return

    if idx is None:
        # randint is inclusive on BOTH ends, so the upper bound is n - 1.
        idx = randint(0, n_samples - 1)

    # Look the sample up before printing anything, so a missing index does not
    # leave a dangling separator line in the output.
    try:
        text, target = feature[idx], label[idx]
    except (KeyError, IndexError):
        print(f"Can't print text: index {idx} not found.")
        return

    print(separator)
    print(f"[{idx}]", text, "-->", target)
    print(separator)

# Show one randomly chosen raw sample with its label (idx is left to print_text;
# no random seed is set, so the sample differs between runs).
print_text(X, y)

----------------------------------------------------------------------------------------------------
[1904] Ver El Teen Mom Cuelgue Con Su Hija y fiesta con sus amigas AQUÍ ! --> 0
----------------------------------------------------------------------------------------------------


## Cleaning/Preprocessing dataset

In [5]:
def clean_text(text):
    """Normalize one text: letters only, single spaces, lowercase.

    Steps:
      1. Drop every character that is not an ASCII letter, a Spanish
         accented letter (á é í ó ú ü ñ, either case) or whitespace. This also
         removes digits and punctuation.
      2. Collapse runs of whitespace into a single space.
      3. Lowercase and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN/None for a missing cell) are
        treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing is left or the input was not a
        string.
    """
    # Missing cells come through as float NaN / None; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Keep letters (including ü/Ü, which Spanish uses in e.g. "pingüino") and
    # whitespace; everything else, digits included, is removed.
    text = re.sub(r'[^A-Za-zÁÉÍÓÚÜáéíóúüÑñ\s]', '', text)

    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase
    return text.lower().strip()

# apply cleaning
X_clean = X.apply(clean_text)

# Show the same random sample before and after cleaning.
# randint is inclusive on both ends, so the last valid position is len(X) - 1.
idx = randint(0, len(X) - 1)
print_text(X, y, idx)
print_text(X_clean, y, idx)

    

----------------------------------------------------------------------------------------------------
[11374] Eso es algo que quiero decir muy bruscamente justo al principio . --> 1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
[11374] eso es algo que quiero decir muy bruscamente justo al principio --> 1
----------------------------------------------------------------------------------------------------


## Remove Stopwords

In [6]:
# Spanish stopword list from NLTK's stopwords corpus (downloaded in the setup cell).
stopwords_sp = stopwords.words("spanish")
print(len(stopwords_sp), "spanish stopwords")  # 313 stopwords for the spanish language

def remove_stopwords(text):
    """Return ``text`` with every token found in ``stopwords_sp`` removed.

    The text is split on whitespace, tokens are compared as-is (case-sensitive)
    against the stopword list, and the survivors are re-joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        if token in stopwords_sp:
            continue
        kept.append(token)
    return ' '.join(kept)

# apply removing stopwords
X_clean = X_clean.apply(remove_stopwords)

# check results: same random sample, raw vs. cleaned.
# randint is inclusive on both ends, so the last valid position is len(X) - 1.
idx = randint(0, len(X) - 1)
print_text(X, y, idx)
print_text(X_clean, y, idx)


313 spanish stopwords
----------------------------------------------------------------------------------------------------
[9250] En las palabras inmortales del patriota escocés Mel Gibson , " usted puede tomar sus vidas , pero nunca tomar sus pompones ! --> 1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
[9250] palabras inmortales patriota escocés mel gibson usted puede tomar vidas nunca tomar pompones --> 1
----------------------------------------------------------------------------------------------------


## Split Dataset

In [7]:
# Stratified 70/30 split; each part gets a fresh 0..n-1 index right away so
# positional lookups work on every split.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X_clean, y, test_size=0.3, random_state=42, stratify=data['label']
    )
)

# print shapes (features, labels) for the train and test parts
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(10446,) (10446,)
(4478,) (4478,)
