## AT1: Data Augmentation for Misogynistic Tweet Detection
Given a labelled tweet, words in the tweet are randomly replaced by semantically similar words from a pretrained word-vector space to create an artificial new tweet. The new tweet is assigned the label of the given tweet.

### Import Required libraries

In [1]:
import pandas as pd
from gensim.models import KeyedVectors
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import random



### Define source and destination files of data

In [2]:
# Directory that holds both the input and the output CSV. It ends with a path
# separator because file names are appended by plain string concatenation.
base = 'U:\\Research\\'
# Name of the input CSV (read from `base`).
source = 'train_ini.csv'
# Name of the output CSV that receives the augmented data (written to `base`).
destination = 'AT1.csv'

### Read dataset

In [3]:
# Load the input CSV; the path is `base` and `source` concatenated.
df_in = pd.read_csv(base+source, encoding='utf8')
# Preview the first rows of the loaded frame.
df_in.head()

Unnamed: 0,label,text
0,1,xrtu: bitch this is mater xurl
1,0,xatp xatp xatp xatp xatp \r\nmera desh badal r...
2,1,xrtu: act like a cunt 🤷🏻‍♂️ xurl
3,1,xrtu: bitch now is not the time xurl
4,0,xrtu: bitch i'm the boy who the fuck is you xurl


## Data preprocessing section

In [4]:
# Shared helpers: split text into runs of word characters, then stem each run.
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()


def preprocess_atweet(tweet):
    """Lower-case a tweet, tokenize it and return the list of stemmed tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        One Porter-stemmed token per ``\\w+`` match in the lower-cased text,
        in their original order.
    """
    lowered = tweet.lower()
    stems = []
    for word in tokenizer.tokenize(lowered):
        stems.append(p_stemmer.stem(word))
    return stems

### Load pretrained word vectors

In [5]:
# Location of the pretrained word vectors.
wv_loc = 'U:\\Research\\w2v\\vectors.txt'
# NOTE(review): KeyedVectors.load presumably expects a file written by gensim's
# own save() (pickle-based), not a word2vec text file -- confirm that this
# '.txt' file is in that format and comes from a trusted source.
model_cbow = KeyedVectors.load(wv_loc)

### Apply Data Augmentation Policy

In [6]:
# Seed Python's RNG once so the replacement draws below are repeatable when the
# notebook is run top to bottom on a fresh kernel.
random.seed(123)


def create_atweet(tweet, no_top_words, model):
    """Build an artificial tweet by randomly swapping tokens for similar words.

    Every token found in the word-vector vocabulary is, with probability 0.5,
    replaced by one of its ``no_top_words`` most similar words. All other
    tokens are copied unchanged.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of the source tweet.
    no_top_words : int
        Number of nearest neighbours considered as replacement candidates.
        With 1, the single most similar word is always used and no extra
        random draw is made.
    model : object
        Either a model exposing its vectors as ``model.wv`` or the keyed
        vectors themselves. The vectors must provide a vocabulary mapping
        (``key_to_index`` or ``vocab``) and
        ``most_similar(positive=..., topn=...)`` returning ``(word, score)``
        pairs.

    Returns
    -------
    str
        The new tweet, tokens joined by single spaces.
    """
    # Accept a full model (vectors under ``.wv``) as well as bare keyed vectors.
    vectors = getattr(model, 'wv', model)
    # gensim >= 4 exposes the vocabulary as ``key_to_index``; older releases
    # call the same mapping ``vocab``.
    vocab = getattr(vectors, 'key_to_index', None)
    if vocab is None:
        vocab = vectors.vocab

    new_tweet = []
    for term in tweet:
        # The random draw only happens for in-vocabulary terms (short-circuit).
        if term in vocab and random.random() > 0.5:
            candidates = vectors.most_similar(positive=[term], topn=no_top_words)
            if not candidates:
                # Nothing to substitute: keep the token instead of dropping it.
                new_tweet.append(term)
            elif len(candidates) == 1:
                new_tweet.append(candidates[0][0])
            else:
                # One-for-one replacement: pick a single neighbour at random.
                new_tweet.append(random.choice(candidates)[0])
        else:
            new_tweet.append(term)
    return ' '.join(new_tweet)

In [7]:
def inflade_tweet_set(df=None, no_top_words=1, fold=1, model=None):
    """Inflate a labelled tweet set with artificial variants of every tweet.

    For each input row the preprocessed original tweet is emitted first,
    followed by ``fold`` artificial tweets produced by ``create_atweet``.
    All of them carry the label of the original row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with a ``label`` and a ``text`` column. Its index may be
        anything; rows are processed in their stored order.
    no_top_words : int, default 1
        Passed through to ``create_atweet``.
    fold : int, default 1
        Number of artificial tweets generated per input tweet.
    model : object
        Word-vector model passed through to ``create_atweet``.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``label`` and ``text`` holding ``1 + fold`` rows
        per input row (no rows, but the same columns, for empty input).
    """
    tweet_list = []
    # Walk the column values directly. Looking rows up with ``df.iloc[idx]``
    # for ``idx`` taken from ``df.index`` confuses index labels with positions
    # and fails (or returns the wrong row) on any non-default index.
    for label, text in zip(df['label'], df['text']):
        tweet = preprocess_atweet(text)
        tweet_list.append({'label': label, 'text': ' '.join(tweet)})
        for _ in range(fold):
            new_tweet = create_atweet(tweet, no_top_words, model)
            tweet_list.append({'label': label, 'text': new_tweet})
    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(tweet_list, columns=['label', 'text'])

In [8]:
# Augment the input set: each tweet yields its preprocessed original plus
# fold=3 artificial tweets.
new_df = inflade_tweet_set(df=df_in, no_top_words=1, fold=3, model=model_cbow)

In [9]:
# Preview the first rows of the augmented set.
new_df.head()

Unnamed: 0,label,text
0,1,xrtu bitch thi is mater xurl
1,1,xrtu bitch thi is mater xurl
2,1,xrtu nigga thi s mater xurl
3,1,xrtu nigga thi is mater xurl
4,0,xatp xatp xatp xatp xatp mera desh badal raha ...


In [10]:
# Preview the last rows of the augmented set.
new_df.tail()

Unnamed: 0,label,text
5387,0,xatp piensen q era la asco fake
5388,0,bad lil bitch w her hip so curvi
5389,0,good ice bitch with her argumento so curvi
5390,0,bad ice nigga w my argumento so curvi
5391,0,good ice bitch with my hip too curvi


In [11]:
# Shuffle all rows and renumber them from 0. A fixed random_state is required
# for a repeatable order: pandas does not draw from Python's `random` module,
# so the earlier random.seed(123) has no effect on this shuffle.
sf_df = new_df.sample(frac=1, random_state=123).reset_index(drop=True)

In [12]:
# Preview the first rows of the shuffled set.
sf_df.head()

Unnamed: 0,label,text
0,1,0dbjtlvpzo he s not give anyon dgb autonòmiqu ...
1,0,xrtu eu amava questo um 2011 aiai que decepção...
2,1,xrtu yall sukisalvador to convinc peopl mcdona...
3,1,xatp whore jdhdjfjd
4,0,xatp listen to a load of shit untz untz dont c...


In [13]:
# Preview the last rows of the shuffled set.
sf_df.tail()

Unnamed: 0,label,text
5387,0,ayuda jin va my lo david bowi
5388,1,xrtu yassss 더 xurl
5389,1,xrtu it don t me bitch it s u
5390,1,xatp i don drm3 kidfuckingpanda gzycqbr5xf you...
5391,0,like idk when thi hoe got so funni but i m rol...


## Write augmented data to destination

In [14]:
# Write the shuffled, augmented set to `base` + `destination`. `index` is a
# boolean option, so pass False explicitly (instead of relying on None being
# falsy) to leave the row index out of the file.
sf_df.to_csv(base+destination, encoding='utf8', index=False)