# Importing and combining datasets

In [None]:
# # install pandas
# !pip install pandas

import pandas as pd
# Source datasets, each reduced to a `text` column plus a `sentiment` label column.
# Entry layout: (csv path, extra read_csv kwargs, label column name, capitalise labels?)
dataset_specs = [
    # local-news: text + label*
    # NOTE(review): every other malaya file is read from 'malaya/'; 'dmalaya/'
    # may be a typo -- confirm against the directory layout before changing it.
    (r'dmalaya/local-news.csv', {}, 'label', False),
    # semisupervised-bert-xlnet: text + label*
    (r'malaya/semisupervised-bert-xlnet.csv', {}, 'label', False),
    # semisupervised-politics-bert-xlnet: text + label*
    (r'malaya/semisupervised-politics-bert-xlnet.csv', {}, 'label', False),
    # supervised-data: text + sentiment*
    (r'malaya/supervised-data.csv', {'sep': '\t'}, 'sentiment', False),
    # supervised-data-politics: text + sentiment*
    (r'malaya/supervised-data-politics.csv', {'sep': '\t'}, 'sentiment', False),
    # manglish: text + sentiment* (labels are capitalised after loading)
    (r'malaya/manglish.csv', {'sep': '\t'}, 'sentiment', True),
    # twitter: text + sentiment* (labels are capitalised after loading)
    (r'scrapping/twitter.csv', {'sep': '\t'}, 'sentiment', True),
]

frames = []
for path, read_kwargs, label_col, capitalise in dataset_specs:
    df = pd.read_csv(path, **read_kwargs)
    if capitalise:
        # The vectorised .str accessor leaves missing labels as NaN instead of
        # raising AttributeError the way `x.capitalize()` does on a float NaN.
        df[label_col] = df[label_col].str.capitalize()
    frames.append(df[['text', label_col]].rename(columns={label_col: 'sentiment'}))

# Concatenate once: avoids re-copying the accumulated frame for every dataset
# and avoids concatenating onto an empty seed DataFrame.
combined_df = pd.concat(frames)

# Drop rows without text and renumber the rows from 0.
combined_df = combined_df[combined_df['text'].notnull()].reset_index(drop=True)
combined_df

# Data preprocessing

## Data Cleaning

In [None]:
from unidecode import unidecode
import string
import re

# Characters deleted by clean_text. Same character set as before; the backslash
# is now written as an explicit '\\' instead of the invalid escape '\('.
punctuation = '‘’“”!$%&\\()*+,./:;<=>?[\\]^_`{|}~•@…'

# Built once so clean_text does not rebuild them for every row.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)
# A whole token starting with '@', '#', 'http' or 'https'. The lookbehind
# matches at the start of the text AND after any whitespace, so tokens in the
# middle of the text are removed too (a leading '^' only matched the first one).
_UNWANTED_TOKEN = re.compile(r'(?<!\S)(?:[@#]|https?)\S*')
# Any run of word characters that contains at least one digit.
_WORD_WITH_DIGIT = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Return a cleaned, lower-cased copy of ``text``.

    Steps, in order:
      1. convert characters to ASCII with ``unidecode``;
      2. drop whole tokens that start with ``@``, ``#``, ``http`` or ``https``;
      3. delete every character listed in ``punctuation``;
      4. lower-case the text;
      5. drop words that contain a digit;
      6. collapse every run of whitespace (including newlines) to one space
         and trim both ends.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # convert characters to ascii
    # NOTE(review): this runs before punctuation removal, so the non-ASCII
    # entries in `punctuation` are presumably already transliterated by the
    # time the table is applied -- confirm whether that ordering is intended.
    text = unidecode(text)

    # remove words that are hashtags, mentions and links
    text = _UNWANTED_TOKEN.sub('', text)

    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # lowercasing text
    text = text.lower()

    # remove words containing numbers
    text = _WORD_WITH_DIGIT.sub('', text)

    # Newlines become word separators rather than being deleted (deleting them
    # glued the neighbouring words together), and the gaps left behind by the
    # removals above are squeezed to single spaces.
    return ' '.join(text.split())
    
# Clean every entry of the text column in place, then display the frame.
combined_df['text'] = combined_df['text'].apply(clean_text)
combined_df

## Normalise short-form words

In [None]:
# Short-form lookup tables; the cilisos file is decoded as ISO-8859-1.
malaya_sf = pd.read_csv(r'../normalise/malaya.csv')
cilisos_sf = pd.read_csv(r'../normalise/cilisos.csv', encoding='ISO-8859-1')

# Map the first column of every row to its second column. The malaya rows are
# inserted first, so a duplicate key ends up with the cilisos value.
combined_sf = dict(
    (row[0], row[1])
    for table in (malaya_sf, cilisos_sf)
    for row in table.values.tolist()
)

def normalise_text(text):
    """Swap each whitespace-separated token that is a key of ``combined_sf``
    for its mapped value and re-join the tokens with single spaces."""
    tokens = text.split()
    replaced = [combined_sf.get(token, token) for token in tokens]
    return ' '.join(replaced)

# Normalise every entry of the text column in place, then display the frame.
combined_df['text'] = combined_df['text'].apply(normalise_text)
combined_df

## Saving datasets

In [None]:
# Write the combined frame as a tab-separated file encoded as ISO-8859-1.
combined_df.to_csv(
    r'combined_data.csv',
    sep='\t',
    encoding='ISO-8859-1',
)