# Assignment 1

## Imports

In [103]:
import pandas as pd
import json
import re
import emoji
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

In [104]:
# Fetch the NLTK resources this notebook relies on: the WordNet data used by
# WordNetLemmatizer and the averaged-perceptron models used by pos_tag.
# NOTE(review): both tagger package names are requested, presumably so the
# cell works across NLTK versions — confirm which one the installed version needs.
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\matti\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\matti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\matti\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\matti\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Task 1

In [105]:
def dataset_from_json(name):
    """Load a JSON split and return its English tweets with one numeric label each.

    The file must contain a JSON object whose values are records holding at
    least the keys ``id_EXIST``, ``lang``, ``tweet`` and ``labels_task2``; the
    top-level keys become the index of the returned frame.

    Processing steps:
      1. keep only the four columns above and rename ``labels_task2`` to ``label``;
      2. keep only rows whose ``lang`` is ``'en'``;
      3. collapse a list of labels into its single most frequent value
         (rows with an empty list or a tie for first place are discarded);
      4. encode the label as an integer: ``'-'`` -> 0, ``'DIRECT'`` -> 1,
         ``'JUDGEMENTAL'`` -> 2, ``'REPORTED'`` -> 3;
      5. drop every row left without a valid label (no majority, or a label
         that is not part of the encoding).

    Parameters
    ----------
    name : str or path-like
        Path of the UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_EXIST``, ``lang``, ``tweet`` and ``label`` (``int64``).

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the data.
    """
    from collections import Counter

    label_ids = {'-': 0, 'DIRECT': 1, 'JUDGEMENTAL': 2, 'REPORTED': 3}

    def most_frequent_or_drop(arr):
        """Return the unique most frequent item of a list, or None if there is none."""
        if not isinstance(arr, list):
            # Already a scalar (or missing): leave it for the encoding step.
            return arr
        top_two = Counter(arr).most_common(2)
        if not top_two:
            return None  # empty list: nothing to vote on
        if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
            return None  # tie for first place: no clear majority
        return top_two[0][0]

    # Only the parsing needs the file handle; release it before the pandas work.
    with open(name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame.from_dict(data, orient='index')
    df = df[['id_EXIST', 'lang', 'tweet', 'labels_task2']]
    df = df.rename(columns={'labels_task2': 'label'})
    # Explicit copy so the column assignment below never targets a view.
    df = df[df['lang'] == 'en'].copy()

    # Encode BEFORE dropping: a majority label missing from `label_ids` maps
    # to NaN and must be removed together with the "no majority" rows.
    df['label'] = df['label'].apply(most_frequent_or_drop).map(label_ids)
    # Fixed-width dtype keeps the column identical across platforms.
    df = df.dropna(subset=['label']).astype({'label': 'int64'})

    return df

# Load the three dataset splits; paths are relative to the notebook's working directory.
train_df, val_df, test_df = (
    dataset_from_json(path)
    for path in ('data/training.json', 'data/validation.json', 'data/test.json')
)

In [106]:
# Preview the first rows of the freshly loaded training split.
train_df.head()
# test_df.head()

Unnamed: 0,id_EXIST,lang,tweet,label
200001,200001,en,FFS! How about laying the blame on the bastard...,0
200002,200002,en,Writing a uni essay in my local pub with a cof...,3
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,3
200004,200004,en,@GMB this is unacceptable. Use her title as yo...,0
200005,200005,en,‘Making yourself a harder target’ basically bo...,0


## Task 2

In [None]:
def text_preprocessing(df):
    """Clean and lemmatize the ``tweet`` column of ``df``.

    Cleaning removes, in order: emojis, URLs, ``@mentions``, ``#`` signs (the
    hashtag word itself is kept), every character that is neither an ASCII
    letter nor whitespace (digits, punctuation and all quote characters
    included), and finally collapses whitespace runs to a single space.
    Letter case is deliberately left untouched.

    Each cleaned tweet is then split on whitespace, POS-tagged with
    ``nltk.pos_tag`` and every word is lemmatized with the WordNet
    lemmatizer using the matching part of speech.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``tweet`` column is
        overwritten in place with the processed text.
    """
    # One lemmatizer for the whole frame instead of a new one per tweet.
    lemmatizer = WordNetLemmatizer()

    # First letter of the Penn Treebank tag -> WordNet POS constant.
    wordnet_pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    def clean_text(text):
        """Strip emojis, URLs, mentions and non-letter characters from one tweet."""
        text = emoji.replace_emoji(text, replace='')  # remove emojis
        # 'http\S+' already covers https links, so no separate alternative is needed.
        text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
        text = re.sub(r'\@\w+|\#', '', text)  # remove mentions and the '#' sign
        # Drops digits, punctuation and every kind of quote/apostrophe, so no
        # dedicated quote-handling step is needed afterwards.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
        return text

    def get_wordnet_pos(tag):
        """Map a POS tag to a WordNet POS, defaulting to noun when unknown."""
        return wordnet_pos_by_prefix.get(tag[:1], wordnet.NOUN)

    def lemmatize_text(text):
        """Lemmatize every whitespace-separated word of ``text``."""
        words = text.split()
        pos_tags = pos_tag(words)
        lemmatized_words = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in pos_tags
        ]
        # split()/join() also discards any leading or trailing space left by cleaning.
        return ' '.join(lemmatized_words)

    df['tweet'] = df['tweet'].apply(clean_text)
    df['tweet'] = df['tweet'].apply(lemmatize_text)
    return df

# Run the same preprocessing on every split. text_preprocessing overwrites the
# tweet column of the frame it receives, so re-running this cell feeds it
# already-processed text.
train_df, val_df, test_df = (
    text_preprocessing(split) for split in (train_df, val_df, test_df)
)

In [108]:
# Preview the training split after cleaning and lemmatization.
train_df.head()
# test_df.head()

Unnamed: 0,id_EXIST,lang,tweet,label
200001,200001,en,FFS How about lay the blame on the bastard who...,0
200002,200002,en,Writing a uni essay in my local pub with a cof...,3
200003,200003,en,it be not I dont appreciate that on two ride b...,3
200004,200004,en,this be unacceptable Use her title a you do fo...,0
200005,200005,en,Making yourself a hard target basically boil d...,0
