# Data Preprocessing

### Import relevant libraries

In [394]:
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import MinMaxScaler

### Import and build news data

In [395]:
news_data = pd.read_csv('Dawn_News.txt', names = ['Date', 'News'], index_col = 0) #Original raw data file

In [396]:
news_data['Polarity'] = [0 for i in range(len(news_data))] #Initialize polarities to zero

In [397]:
news_data.drop_duplicates(subset ="News", 
                     keep = False, inplace = True) 
  

In [398]:
news_data.reset_index(drop=True, inplace=True)

In [399]:
news_data.head(30)

Unnamed: 0,Date,News,Polarity
0,2011-04-01,Judgment on judges` extension case suspended,0
1,2011-04-01,Gilani hints at giving subsidy on oil prices,0
2,2011-04-01,Agencies averse to aerial survey by foreigners,0
3,2011-04-01,SC gives PPP leaders time to file reply in con...,0
4,2011-04-01,"Quake measuring 5.2 rattles Islamabad, Mansehra",0
5,2011-04-01,Pakistan ready to hand suspected militant to I...,0
6,2011-04-01,PPP always respected judiciary: Sharjeel Memon,0
7,2011-04-01,Gilani to consult politicians on fuel price hike,0
8,2011-04-01,Land reforms vital for women empowerment,0
9,2011-04-01,Bone marrow recipients highlight consequences ...,0


In [400]:
news_data['Date'] = pd.to_datetime(news_data['Date']) #Convert to datetime
news_data['Date'] = news_data["Date"].dt.strftime("%d %m %Y") #Format date
news_data['News'].map(str) #Convert news to string

0             Judgment on judges` extension case suspended
1             Gilani hints at giving subsidy on oil prices
2           Agencies averse to aerial survey by foreigners
3        SC gives PPP leaders time to file reply in con...
4          Quake measuring 5.2 rattles Islamabad, Mansehra
                               ...                        
23001    Tabuk governor arrives in Dalbandin to hunt ho...
23002    Broadsheet commission to probe other cases as ...
23003    Opposition slams move to bring Senate polls bi...
23004    SC issues notice to NAB on Zardari’s pleas for...
23005    Japan to provide $4.57m grant to help procure ...
Name: News, Length: 23006, dtype: object

In [401]:
news_data.tail()

Unnamed: 0,Date,News,Polarity
23001,28 01 2021,Tabuk governor arrives in Dalbandin to hunt ho...,0
23002,28 01 2021,Broadsheet commission to probe other cases as ...,0
23003,28 01 2021,Opposition slams move to bring Senate polls bi...,0
23004,28 01 2021,SC issues notice to NAB on Zardari’s pleas for...,0
23005,28 01 2021,Japan to provide $4.57m grant to help procure ...,0


### Import and stock prices as daily difference

In [402]:
diff = pd.read_csv('Companies diff\ABOT PA Equity.csv')

In [403]:
diff.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,5157.0,5157.0,5157.0,5157.0,5157.0
mean,0.147057,0.149578,0.145137,0.147993,5.805895
std,8.392596,6.892363,7.040883,6.328886,62090.918356
min,-88.8844,-58.9713,-41.2265,-45.0148,-925733.0
25%,-0.7633,-0.8252,-0.6606,-0.7319,-3408.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.8546,0.8822,0.8114,0.7571,3200.0
max,83.7564,43.7413,52.9888,52.1436,918700.0


In [404]:
diff['Date'] = pd.to_datetime(diff['Date']) #Convert date to datetime



In [405]:
diff.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,5157.0,5157.0,5157.0,5157.0,5157.0
mean,0.147057,0.149578,0.145137,0.147993,5.805895
std,8.392596,6.892363,7.040883,6.328886,62090.918356
min,-88.8844,-58.9713,-41.2265,-45.0148,-925733.0
25%,-0.7633,-0.8252,-0.6606,-0.7319,-3408.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.8546,0.8822,0.8114,0.7571,3200.0
max,83.7564,43.7413,52.9888,52.1436,918700.0


### Make relevant functions to map polarities and take into account weekends

In [406]:
def pol(val, mean, std):
    print(val)
    if val >= mean + std:
        
        return 1
    
    elif val < mean-std:
        
        return -1
    
    else:
        
        return 0
    


# Year, month, day	
def check(x):
    d3 = datetime.datetime.strptime(x, '%d %m %Y')
    d2 = datetime.datetime.strptime(x, '%d %m %Y').weekday()
    #print(d3)
    #print(d2)

    if (d2 == 5):
        d1 = d3 - datetime.timedelta(days = 1)
    elif (d2 == 6):
        d1 = d3 - datetime.timedelta(days = 2)
    else:
        d1 = d3
    #print(d1)    
    return d1


check('08 02 2009')

datetime.datetime(2009, 2, 6, 0, 0)

### Label data with polarity

In [407]:
print(len(news_data))
mean = diff['Open'].mean()
std = diff['Open'].std()
for i in range(22543):
    #print(i)
    #print(news_data['Date'][23])
    date = check(news_data['Date'][i])
    #print(date)
    index = diff.index[diff['Date'] == date]
    #print(index)
    #print(int(diff['Close'][index]))
    polarity = pol(float(diff['Open'][index]), mean, std)
    #print(i)
    news_data['Polarity'][i] = polarity
    

23006
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.6696000000000026
-0.66960000000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0.30630000000000024
0.30630000000000024
0.30630000000000024
0.30630000000000024
0.30630000000000024
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.09969999999999857
-0.099699999999998

In [408]:
news_data.describe()

Unnamed: 0,Polarity
count,23006.0
mean,0.041511
std,0.527915
min,-1.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [409]:
count_pos = news_data['Polarity'].value_counts()[1]
count_ntl = news_data['Polarity'].value_counts()[0]
count_neg = news_data['Polarity'].value_counts()[-1]
print(count_pos, count_ntl, count_neg)

3703 16555 2748


### Save labelled data

In [410]:
news_data.to_csv(path_or_buf = 'C:/Users/CZ/Kaavish/Dawn_News_Labelled' + '.csv', index = False) #Save dataframe as CSV
    