## Import Libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Importing Dataset from Kaggle


In [2]:
import os
import json
from dotenv import load_dotenv

load_dotenv('../config.env')

True

In [3]:
data = {'username': os.getenv('KAGGLE_USERNAME'), 'key': os.getenv('KAGGLE_KEY')}

In [4]:
os.environ['KAGGLE_USERNAME'] = data['username']
os.environ['KAGGLE_KEY'] = data['key']

In [5]:
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

api.dataset_download_files('dineshpiyasamara/sentiment-analysis-dataset', path='../Data')

In [6]:
import zipfile
with zipfile.ZipFile('../Data/sentiment-analysis-dataset.zip', 'r') as zipfile:
    zipfile.extractall('../Data')

In [7]:
dataset = pd.read_csv('../Data/sentiment_analysis.csv')

dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [8]:
dataset.shape

(7920, 3)

In [9]:
dataset.duplicated().sum() # Check is there any duplicated values

0

In [10]:
dataset.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

## Remove non-valued characters(numbers, symbols) - TEXT PREPARATION


In [11]:
import re
import string

### Convert Uppercases to lowercase


In [12]:
dataset['tweet'] = dataset['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

### Remove links


In [13]:
dataset['tweet'] = dataset['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

### Remove punchuations


In [14]:
def remove_punchuations(text):
    for punchuation in string.punctuation:
        text = text.replace(punchuation, '')
    return text

dataset['tweet'] = dataset['tweet'].apply(remove_punchuations)

### Remove Numbers


In [15]:
dataset['tweet'] = dataset['tweet'].str.replace('d+', '', regex=True)

### Remove stop-words (am, is, are)


In [18]:
import nltk

nltk.download('stopwords', download_dir='../Data/static/model')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [19]:
with open('../Data/static/model/stopwords/english', 'r') as file:
    stopwords = file.read().splitlines()

In [21]:
dataset['tweet'] = dataset['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in stopwords))

### Stemming (Base word)


In [23]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [24]:
dataset['tweet'] = dataset['tweet'].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [25]:
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test anroi app beauti cut...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love woul go talk makememori unplug relax ipho...
3,4,0,im wire know im georg mae way iphon cute avent...
4,5,1,amaz servic appl wont even talk question unles...
