# Importing and combining datasets

In [None]:
# # install pandas
# !pip install pandas

import pandas as pd
def _load_source(path, label_column='sentiment', capitalize=False, **read_kwargs):
    """Read one source CSV and return it as a two-column ``text``/``sentiment`` frame.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    label_column : str
        Name of the column holding the label in this file; it is renamed to
        ``sentiment`` so that every source ends up with the same schema.
    capitalize : bool
        When True, labels are capitalised (first letter upper-case, rest lower-case).
    **read_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``sep='\\t'``).
    """
    frame = pd.read_csv(path, **read_kwargs)
    frame = frame[['text', label_column]].rename(columns={label_column: 'sentiment'})
    if capitalize:
        # `.str.capitalize()` leaves a missing label as NaN; calling
        # `x.capitalize()` on each value would raise AttributeError on a NaN.
        frame = frame.assign(sentiment=frame['sentiment'].str.capitalize())
    return frame


source_frames = [
    # local-news: text + label*
    # NOTE(review): every other malaya file lives under 'malaya/'; confirm that
    # 'dmalaya/' is the intended directory and not a typo.
    _load_source(r'dmalaya/local-news.csv', label_column='label'),

    # semisupervised-bert-xlnet: text + label*
    _load_source(r'malaya/semisupervised-bert-xlnet.csv', label_column='label'),

    # semisupervised-politics-bert-xlnet: text + label*
    _load_source(r'malaya/semisupervised-politics-bert-xlnet.csv', label_column='label'),

    # supervised-data: text + sentiment*
    _load_source(r'malaya/supervised-data.csv', sep='\t'),

    # supervised-data-politics: text + sentiment*
    _load_source(r'malaya/supervised-data-politics.csv', sep='\t'),

    # manglish: text + sentiment* (labels are capitalised)
    _load_source(r'malaya/manglish.csv', capitalize=True, sep='\t'),

    # twitter: text + sentiment* (labels are capitalised)
    _load_source(r'scrapping/twitter.csv', capitalize=True, sep='\t'),
]

# Concatenate once instead of growing the frame source by source.
combined_df = pd.concat(source_frames)

# Drop rows without text and renumber the rows from 0.
combined_df = combined_df[combined_df['text'].notnull()].reset_index(drop=True)[['text', 'sentiment']]
combined_df

# Data preprocessing

## Data Cleaning

In [None]:
from unidecode import unidecode
import string
import re

# Characters deleted from the text by clean_text. The backslash is written as an
# explicit '\\' escape; the set of characters is the same as before.
punctuation = '‘’“”!$%&\\()*+,./:;<=>?[\\]^_`{|}~•@…'

# Built once, rather than on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

# A whitespace-delimited word starting with '@', '#' or 'http', wherever it
# occurs in the text (the lookbehind also matches at the start of the string).
_TAG_OR_LINK_RE = re.compile(r'(?<!\S)(?:[@#]|https?)\S*')

# Any word that contains at least one digit.
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Return a cleaned, lower-cased copy of ``text``.

    Steps, in order:
      1. convert characters to ASCII with ``unidecode``;
      2. remove words that are hashtags, mentions or links;
      3. delete every character listed in ``punctuation``;
      4. replace newlines with a space, so words on adjacent lines stay separate;
      5. lower-case the text;
      6. remove words containing digits;
      7. strip leading and trailing whitespace.

    Removed words leave their surrounding spaces behind, so the result may
    contain runs of several spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # convert characters to ascii
    text = unidecode(text)

    # remove words that are hashtags, mentions and links
    text = _TAG_OR_LINK_RE.sub('', text)

    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # turn line breaks into spaces instead of gluing neighbouring words together
    text = text.replace('\n', ' ')

    # lowercasing text
    text = text.lower()

    # remove words containing numbers
    text = _WORD_WITH_DIGIT_RE.sub('', text)

    # strip last, so whitespace exposed by the removals above is trimmed too
    return text.strip()
    
# Run every entry of the text column through clean_text and store the result
# back in the same column, then display the frame.
cleaned_text = combined_df['text'].apply(clean_text)
combined_df['text'] = cleaned_text
combined_df

## Normalise short-form words

In [None]:
# Two lookup tables; the first column of each row is used as the key and the
# second column as its replacement.
malaya_sf = pd.read_csv(r'../normalise/malaya.csv')
cilisos_sf = pd.read_csv(r'../normalise/cilisos.csv', encoding='ISO-8859-1')

# Merge both tables into one dictionary. Rows are processed in order, so a key
# that appears more than once keeps the value of its last occurrence
# (cilisos rows come after malaya rows).
combined_sf = {}
for row in malaya_sf.values.tolist() + cilisos_sf.values.tolist():
    combined_sf[row[0]] = row[1]

def normalise_text(text):
    """Replace every word of ``text`` that is a key of ``combined_sf`` with its mapped value.

    The text is split on whitespace, each word is looked up in the module-level
    ``combined_sf`` dictionary (words without an entry are kept unchanged), and
    the words are joined back together with single spaces.
    """
    replaced_words = []
    for word in text.split():
        replaced_words.append(combined_sf.get(word, word))
    return ' '.join(replaced_words)

# Run every entry of the text column through normalise_text and store the
# result back in the same column, then display the frame.
normalised_text = combined_df['text'].apply(normalise_text)
combined_df['text'] = normalised_text
combined_df

## Clean null and meaningless values

In [None]:
# Write the frame to disk and read it straight back; the filters below work on
# the re-read copy.
combined_df.to_csv(r'combined_data.csv', sep='\t', encoding='ISO-8859-1')
combined_df = pd.read_csv(r'combined_data.csv', sep='\t', encoding='ISO-8859-1')

# keep rows whose text is a string of more than 5 whitespace-separated words
has_enough_words = combined_df['text'].apply(
    lambda value: type(value) is str and len(value.split()) > 5
)
combined_df = combined_df[has_enough_words].reset_index()[['text', 'sentiment']]

# keep rows whose sentiment is a string (i.e. not missing)
has_sentiment = combined_df['sentiment'].apply(lambda value: type(value) is str)
combined_df = combined_df[has_sentiment].reset_index()[['text', 'sentiment']]

combined_df

## Saving datasets

In [None]:
# Save the filtered frame as a tab-separated, ISO-8859-1 encoded file. The
# index is written too and shows up as an unnamed first column on re-read.
combined_df.to_csv(
    'combined_data.csv',
    sep='\t',
    encoding='ISO-8859-1',
)

## Sample datasets

In [2]:
import pandas as pd
# Fixed seed so that re-running this cell draws the same rows in the same order.
RANDOM_SEED = 42
# Number of rows drawn from each sentiment class.
SAMPLES_PER_CLASS = 50000

combined_df = pd.read_csv(r'combined_data.csv', sep='\t', encoding='ISO-8859-1')

# Draw the same number of rows from every sentiment class, then shuffle the
# result (`sample(frac=1)`) so the classes are interleaved.
# NOTE: sampling without replacement raises ValueError if a class has fewer
# than SAMPLES_PER_CLASS rows.
sampled_df = (
    combined_df
    .groupby('sentiment')
    .sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
    .sample(frac=1, random_state=RANDOM_SEED)
)

# Renumber the shuffled rows from 0 and keep only the two data columns.
sampled_df = sampled_df.reset_index(drop=True)[['text', 'sentiment']]

sampled_df.to_csv(r'sampled_data.csv', sep='\t', encoding='ISO-8859-1')
sampled_df

Unnamed: 0,text,sentiment
0,maybe he knew how banyak saya loves u things s...,Negative
1,betulkan kerajaan sekarang di tanah air sendiri,Negative
2,jadi this is what saya meant,Negative
3,selangortv pakej kita selangor bukti prihatin ...,Negative
4,lepas itu dia papp papp paap appl yang dahulu ...,Negative
...,...,...
149995,habis lepas ini hendak buang mase dekat mana p...,Negative
149996,mp bn umum kekal sokong pm muhyiddinyassin wal...,Negative
149997,untuk pandan sk tasik permai ampang selangor##,Neutral
149998,can't imagine pokka now like wear green hat,Neutral
