# Clean data before labelling

In [3]:
import pandas as pd
import re

def clean(input_file='./raw_scraped/2022-07.csv',
          output_file='./cleaned/2022-07-cleaned.csv',
          keys_file='./keys.txt'):
    """Clean a CSV of scraped tweets and keep only those containing a keyword.

    Processing steps, in order:
      1. drop rows that repeat the same (text, username) pair;
      2. strip mentions (``@...``) and URLs (``http://`` / ``https://``);
      3. drop trailing line breaks and turn interior ones into spaces;
      4. collapse every run of whitespace into a single space;
      5. discard tweets whose lower-cased text contains none of the keywords
         (plain substring match).

    The surviving tweets are written to ``output_file`` with the columns
    ``datetime, text, username, target``; ``target`` is filled with a single
    space as a placeholder for later labelling.

    Args:
        input_file: CSV with at least ``datetime``, ``text`` and ``username``
            columns.
        output_file: destination CSV path.
        keys_file: ``;``-separated file whose ``Words`` column lists the
            keywords.

    Returns:
        None. Progress and every discarded tweet are printed to stdout.
    """
    mention_or_url = re.compile(r"(?:\@|https?\://)\S+")
    whitespace_run = re.compile(r"\s+")

    df = pd.read_csv(input_file, index_col=False, delimiter=",")
    # drop_duplicates returns a new frame, so the result must be kept;
    # the index is rebuilt so row positions stay contiguous afterwards.
    df = df.drop_duplicates(subset=['text', 'username']).reset_index(drop=True)

    keywords = pd.read_csv(keys_file, sep=';')['Words'].values

    tweets_list = []

    rows = zip(df['datetime'], df['text'], df['username'])
    for i, (when, text, username) in enumerate(rows):
        text = mention_or_url.sub('', text)
        # Trailing line breaks are dropped; interior ones become a space in
        # the whitespace pass below, so adjacent words are never glued.
        text = text.rstrip("\r\n")
        text = whitespace_run.sub(' ', text)

        lowered = text.lower()
        if any(word in lowered for word in keywords):
            tweets_list.append([when, text, username, " "])
            if i % 100 == 0:
                print("Tweets filtered: ", len(tweets_list), "\t\tdate: ", when)
        else:
            print("Deleted: ", text)

    tweets_df = pd.DataFrame(tweets_list, columns=['datetime', 'text', 'username', 'target'])
    tweets_df.to_csv(output_file, index=False, sep=',')
    print(tweets_df.shape)

## - Remove tweets with few occurrences

In [None]:
import os
import pandas as pd

def remove_few_occurences(in_file="./cleaned/2021-12-cleaned.csv",
                          out_file="./cleaned/2021-12-filtered.csv"):
    """Drop tweets that match a black-listed word and no white-listed word.

    Matching is a case-insensitive substring test on the ``Text`` column.
    A tweet is removed only when it contains at least one black-list word
    and no white-list word; every other tweet is kept.

    Args:
        in_file: CSV with ``Datetime``, ``Text`` and ``Username`` columns.
        out_file: destination CSV path for the kept tweets.

    Returns:
        None. The input shape, output shape and number of removed tweets
        are printed to stdout.
    """
    data = pd.read_csv(in_file)
    white_list = ["pelato", "pelata", "nano", "nana", "obeso", "obesa", "cozza",
                  "ciccione", "grasso", "grassa"]
    # "obesa" is also white-listed, so its black-list entry never removes
    # anything on its own.
    black_list = ["boiler", "anoressica", "anoressico", "cicciona", "nasone",
                  "racchia", "culona", "obesa"]

    print(data.shape)

    tweets_list = []
    deleted = 0

    for when, text, username in zip(data['Datetime'], data['Text'], data['Username']):
        lowered = text.lower()
        # Both flags are computed from scratch for every tweet so that the
        # decision for one row can never leak into the next.
        black = any(word in lowered for word in black_list)
        white = any(word in lowered for word in white_list)

        if black and not white:
            deleted += 1
            continue

        tweets_list.append([when, text, username])

    # The frame has to exist before its shape can be reported.
    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Text', 'Username'])
    print(tweets_df.shape)
    print(deleted)
    tweets_df.to_csv(out_file, index=False, sep=',')

# Extract labelled tweets

In [None]:
import pandas as pd

def extract_labeled(in_file, out_file):
    """Copy the rows whose ``target`` is 0 or 1 from one CSV to another.

    The ``target`` column is compared numerically, so the selection works
    both when pandas parsed the column as text (for example because
    unlabelled rows hold a blank placeholder) and when it parsed it as
    numbers (for example because every row is labelled). Values that
    cannot be read as a number are treated as unlabelled.

    Args:
        in_file: CSV containing a ``target`` column.
        out_file: destination CSV path for the labelled rows.

    Returns:
        The ``(rows, columns)`` shape of the labelled subset.
    """
    df = pd.read_csv(in_file, index_col=False, delimiter=",")
    print(df.shape)

    # errors='coerce' turns non-numeric labels into NaN, which never matches.
    numeric_target = pd.to_numeric(df['target'], errors='coerce')
    labeled = df[numeric_target.isin([0, 1])]
    labeled.to_csv(out_file, index=False, sep=',')

    return labeled.shape

## - Verify balanced classes

In [64]:
import pandas as pd

def verify_balanced():
    """Print how many rows of the labelled CSV have target 0 and target 1."""
    labeled = pd.read_csv(
        './labeled/12-01-rebalanced-only-labeled.csv',
        index_col=False,
        delimiter=",",
    )

    # One row count per class label.
    counts = {label: len(labeled.query(f"target == {label}")) for label in (0, 1)}

    print("Tot 0 --> ", counts[0])
    print("Tot 1 --> ", counts[1])

# Preprocessing

## - Remove punctuation marks, brackets, quotes, special characters

In [None]:
import string

# Single-pass translation table: every ASCII punctuation character is
# deleted, while typographic quotes and the ellipsis character become a space.
_PUNCTUATION_TABLE = str.maketrans({
    **dict.fromkeys(string.punctuation),
    **dict.fromkeys('\u201D\u2018\u2019\u201c\u2026', " "),
})


def remove_punctuation(text):
    """Strip punctuation from ``text``.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        The text with all ``string.punctuation`` characters removed and the
        curly quotes / ellipsis character replaced by a space. Because the
        full stop is itself punctuation, no dots survive, so no separate
        pass over runs of dots is needed.
    """
    return str(text).translate(_PUNCTUATION_TABLE)

## - Text reformat

In [None]:
import re
def text_reformat(text):
    """Normalise dot runs, repeated characters and letter case.

    Runs of two or more dots become a single space. Any character repeated
    two or more times in a row (other than a line break) is cut down to
    exactly two occurrences, e.g. ``bellooooo`` -> ``belloo`` while ``bello``
    is left alone. The result is lower-cased last, so runs are detected
    case-sensitively.
    """
    without_dot_runs = re.sub(r'\.{2,}', ' ', text)
    squeezed = re.sub(r'(.)\1+', lambda run: run.group(1) * 2, without_dot_runs)

    return squeezed.lower()

## - Remove emoticons

In [None]:
def remove_emojis(data):
    """Delete emoji and other symbol characters from ``data``.

    Every character that falls in one of the ranges below is removed.
    Note that the ranges are broad: one of them spans U+24C2 through
    U+1F251 and another covers all code points from U+10000 upwards, so
    far more than emoji is stripped.
    """
    ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",  # listed twice in the pattern; harmless
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    symbol_run = re.compile("[" + "".join(ranges) + "]+", re.UNICODE)

    return symbol_run.sub('', data)

## - Preprocessing call

In [None]:
def preprocessing_steps(data):
    """Run the preprocessing chain on one tweet text.

    The steps are applied in this order: punctuation removal, text
    reformatting, emoji removal.
    """
    return remove_emojis(text_reformat(remove_punctuation(data)))


def preprocess(data):
    """Preprocess the ``text`` column of ``data``.

    Each row's ``text`` value is passed through ``preprocessing_steps`` and
    the column is overwritten with the results. The frame that was passed in
    is modified and also returned.
    """
    data['text'] = [
        preprocessing_steps(row['text']) for _, row in data.iterrows()
    ]

    print("Preprocessing done")

    return data

# Elaboration

## - Stopwords removal

In [None]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize


# Italian stop words, loaded on first use and reused afterwards.
_ITALIAN_STOPWORDS = None


def remove_stopwords(tokens):
    """Return the tokens that are not Italian stop words.

    The stop-word list is read from the NLTK corpus only once and then
    cached, instead of being reloaded for every tweet. The corpus has to be
    available locally (``nltk.download('stopwords')``).

    Args:
        tokens: iterable of word strings.

    Returns:
        A new list with the stop words filtered out, order preserved.
    """
    global _ITALIAN_STOPWORDS
    if _ITALIAN_STOPWORDS is None:
        # Assigned only after a successful load, so a failed lookup is
        # retried on the next call.
        _ITALIAN_STOPWORDS = frozenset(stopwords.words('italian'))

    return [token for token in tokens if token not in _ITALIAN_STOPWORDS]

## - Stemming

In [None]:
from nltk.stem import SnowballStemmer

def stem(tokens):
    """Return the Italian Snowball stem of every token, in order."""
    # SnowballStemmer has to be told which language to use.
    italian_stemmer = SnowballStemmer(language='italian')

    return list(map(italian_stemmer.stem, tokens))

## - Remove meaningless words

In [None]:
def miningfull_words(stemmed):
    """Keep only the words that are at least three characters long."""
    kept = []
    for token in stemmed:
        if len(token) >= 3:
            kept.append(token)
    return kept

## - Remove features with numbers

In [None]:
def remove_numbers(mean_words):
    """Drop every word made up only of digits; keep all the others."""
    return list(filter(lambda token: not token.isdigit(), mean_words))

## - Elaboration call

In [None]:
from nltk.tokenize import word_tokenize

def textual(text):
    """Concatenate the items of ``text``, each followed by one space.

    Args:
        text: iterable of strings.

    Returns:
        A single string; it ends with a trailing space unless ``text`` is
        empty, in which case the empty string is returned.
    """
    # join builds the result in one pass instead of growing a string with +=.
    return "".join(word + " " for word in text)


def elaborating_steps(t):
    """Run the elaboration chain on one tweet text.

    The text is tokenized, then stop words are removed, the remaining
    tokens are stemmed, words shorter than three characters are dropped and
    finally purely numeric words are dropped.

    Args:
        t: the tweet text.

    Returns:
        The surviving words, each followed by one space (so a non-empty
        result ends with a trailing space).
    """
    # NOTE(review): word_tokenize is called without a language argument;
    # confirm whether language='italian' was intended for this corpus.
    tokens = word_tokenize(t)
    tokens = remove_stopwords(tokens)
    stemmed_words = stem(tokens)
    mean_words = miningfull_words(stemmed_words)
    numbers_removed = remove_numbers(mean_words)

    # join builds the result in one pass instead of growing a string with +=.
    return "".join(word + " " for word in numbers_removed)


def elaborate(data):
    """Elaborate the ``text`` column of ``data`` and sort the rows by date.

    Each ``text`` value is passed through ``elaborating_steps`` and stripped
    of surrounding whitespace. The column is replaced by position, so the
    result is correct even when the frame's index contains repeated labels.

    Args:
        data: frame with ``text`` and ``datetime`` columns. Its ``text``
            column is overwritten in place.

    Returns:
        A copy of the frame sorted by ``datetime``.
    """
    data['text'] = [elaborating_steps(text).strip() for text in data['text']]

    data = data.sort_values(by='datetime')

    print("Elaboration done")
    print("\n")

    return data