# Importing and combining datasets

In [1]:
# # install pandas
# !pip install pandas

import pandas as pd
combined_df = pd.DataFrame()

# local-new: text + label*
df = pd.read_csv(r'data/malaya/local-news.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# semisupervised-bert-xlnet: text + label*
df = pd.read_csv(r'data/malaya/semisupervised-bert-xlnet.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# semisupervised-bert-xlnet: text + label*
df = pd.read_csv(r'data/malaya/semisupervised-politics-bert-xlnet.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# supervised-data: text + sentiment* 
df = pd.read_csv(r'data/malaya/supervised-data.csv', sep='\t')
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# supervised-data-politics: text + sentiment* 
df = pd.read_csv(r'data/malaya/supervised-data-politics.csv', sep='\t')
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# supervised-data-politics: text + sentiment* 
df = pd.read_csv(r'data/malaya/manglish.csv', sep='\t')
df['sentiment'] = df['sentiment'].apply(lambda x: x.capitalize())
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# twitter: text + sentiment* 
df = pd.read_csv(r'data/scrapping/twitter.csv', sep='\t')
df['sentiment'] = df['sentiment'].apply(lambda x: x.capitalize())
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

combined_df = combined_df[combined_df['text'].notnull()].reset_index()[['text', 'sentiment']]
combined_df

Unnamed: 0,text,sentiment
0,Lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,Adalah membingungkan mengapa masyarakat Cina b...,Negative
3,Kami menurunkan defisit daripada 6.7 peratus p...,Positive
4,"Ini masalahnya. Bukan rakyat, tetapi sistem",Negative
...,...,...
543662,Boxi in Alliance : 🤡\nBoxi in Liquid : 🗿,Positive
543663,Liquid tolong pause sat 🤣,Neutral
543664,The last time dine-in here was on the day @Nig...,Positive
543665,Meriah lower bracket liquid pun jatuh huhu,Negative


# Data preprocessing

## Data Cleaning

In [2]:
from unidecode import unidecode
import string
import re

punctuation = '‘’“”!$%&\()*+,./:;<=>?[\\]^_`{|}~•@…'

def clean_text(text):
    # convert characters to ascii
    text = unidecode(text)
    
    # remove words that is hashtags, mentions and links
    text = re.sub(r'^([@#]|http|https)[^\s]*', '', text)
    
    # remove punctuation
    text = text.translate(text.maketrans('', '', punctuation))
    
    # remove next line     
    text = re.sub('\n', '', text)
    
    # lowercasing text
    text = text.lower()
    
    # stripping text
    text = text.strip()
    
    # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    
    return text
    
combined_df['text'] = combined_df['text'].apply(lambda x: clean_text(x))
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
543662,boxi in alliance boxi in liquid,Positive
543663,liquid tolong pause sat,Neutral
543664,the last time dine-in here was on the day nigm...,Positive
543665,meriah lower bracket liquid pun jatuh huhu,Negative


## Normalise short-form words

In [4]:
malaya_sf = pd.read_csv(r'normalise/malaya.csv')
cilisos_sf = pd.read_csv(r'normalise/cilisos.csv', encoding='ISO-8859-1')

combined_sf = {x[0]: x[1] for x in malaya_sf.values.tolist() + cilisos_sf.values.tolist()}

def normalise_text(text):
    return ' '.join([combined_sf[x] if x in combined_sf.keys() else x for x in text.split()])

combined_df['text'] = combined_df['text'].apply(lambda x: normalise_text(x))
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet dan...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada ...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
543662,boxi in alliance boxi in liquid,Positive
543663,liquid tolong pause sabtu,Neutral
543664,the akhir masa dine-in here was on the hari ni...,Positive
543665,meriah lower bracket liquid pun jatuh huhu,Negative


## Saving datasets

In [5]:
combined_df.to_csv(r'data/combined_data.csv', sep='\t', encoding='ISO-8859-1')