## Load Libraries Needed

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import datetime
import re
from sklearn.feature_extraction import text

## Part 1: Sentiment Analysis on Textual Data

## Read Textual Data

In [2]:
# Colab-only step: attach Google Drive to the runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Load the headlines CSV from Google Drive. The path is absolute so that it
# resolves against the '/content/drive' mount point regardless of the
# notebook's current working directory.
indian_news = pd.read_csv('/content/drive/MyDrive/india-news-headlines.csv')
# (rows, columns) of the loaded frame
indian_news.shape


(3650970, 3)

## Explore the Textual Data

In [4]:
# preview the first 15 rows of the raw headlines
indian_news.head(n=15)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic
5,20010102,unknown,Dilute the power of transfers; says Riberio
6,20010102,unknown,Focus shifts to teaching of Hindi
7,20010102,unknown,IT will become compulsory in schools
8,20010102,unknown,Move to stop freedom fighters' pension flayed
9,20010102,unknown,Gilani claims he applied for passport 2 years ago


In [5]:
# print a summary of the frame: index range, column dtypes and memory usage
indian_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 83.6+ MB


## Preprocessing for the Indian News Headlines Data

In [6]:
# Delete the unnecessary category column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
indian_news = indian_news.drop(columns=['headline_category'], errors='ignore')

In [7]:
# Convert the publish_date column (integers such as 20010102) to datetime.
# pd.to_datetime parses the whole column in one vectorised call rather than
# calling datetime.strptime once per row. The dtype guard makes the cell
# re-runnable: an already-converted column is left untouched.
if not pd.api.types.is_datetime64_any_dtype(indian_news['publish_date']):
    indian_news['publish_date'] = pd.to_datetime(
        indian_news['publish_date'].astype(str), format='%Y%m%d'
    )

In [8]:
# load english stopwords (kept as a notebook-level name, as before)
stopwords = text.ENGLISH_STOP_WORDS
# every character that is not an ASCII letter or an apostrophe becomes a space
_NON_LETTER = re.compile("[^a-zA-Z']")


def clean_headline(headline):
    """Normalise a single headline for text analysis.

    Lowercases the text, replaces every character other than letters and
    apostrophes with a space, removes English stopwords and joins the
    remaining words with single spaces.

    Parameters
    ----------
    headline : str
        Raw headline text. Non-string values (e.g. NaN for a missing
        headline) are treated as empty instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned headline; "" when no words remain or the input was
        not a string.
    """
    if not isinstance(headline, str):
        return ""
    letters_only = _NON_LETTER.sub(' ', headline.lower())
    return " ".join(word for word in letters_only.split() if word not in stopwords)


# one pass over the column instead of four separate .map calls
indian_news['headline_text'] = indian_news['headline_text'].map(clean_headline)

In [9]:
# inspect the first 15 rows now that the headlines have been preprocessed
indian_news.head(n=15)

Unnamed: 0,publish_date,headline_text
0,2001-01-02,status quo disturbed ayodhya says vajpayee
1,2001-01-02,fissures hurriyat pak visit
2,2001-01-02,america's unwanted heading india
3,2001-01-02,bigwigs destination goa
4,2001-01-02,extra buses clear tourist traffic
5,2001-01-02,dilute power transfers says riberio
6,2001-01-02,focus shifts teaching hindi
7,2001-01-02,compulsory schools
8,2001-01-02,stop freedom fighters' pension flayed
9,2001-01-02,gilani claims applied passport years ago
