# Data Preprocessing

### Import relevant libraries

In [215]:
import pandas as pd
import datetime

### Import and build news data

In [216]:
news_data = pd.read_csv('Dawn_News.txt', names = ['Date', 'News'], index_col = 0) #Original raw data file

In [217]:
news_data['Polarity'] = [0 for i in range(len(news_data))] #Initialize polarities to zero

In [218]:
news_data.head()

Unnamed: 0,Date,News,Polarity
0,2011-04-01,Judgment on judges` extension case suspended,0
1,2011-04-01,Gilani hints at giving subsidy on oil prices,0
2,2011-04-01,Agencies averse to aerial survey by foreigners,0
3,2011-04-01,SC gives PPP leaders time to file reply in con...,0
4,2011-04-01,"Quake measuring 5.2 rattles Islamabad, Mansehra",0


In [219]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28603 entries, 0 to 28602
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      28603 non-null  object
 1   News      28601 non-null  object
 2   Polarity  28603 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 893.8+ KB


In [220]:
news_data['Date'] = pd.to_datetime(news_data['Date']) #Convert to datetime
news_data['Date'] = news_data["Date"].dt.strftime("%d %m %Y") #Format date
news_data['News'].map(str) #Convert news to string

0             Judgment on judges` extension case suspended
1             Gilani hints at giving subsidy on oil prices
2           Agencies averse to aerial survey by foreigners
3        SC gives PPP leaders time to file reply in con...
4          Quake measuring 5.2 rattles Islamabad, Mansehra
                               ...                        
28598        Pakistan to stop manual visas from next month
28599                  Pakistan, Kuwait vow to expand ties
28600                    ‘Digital Pakistan coming to life’
28601    Pakistan grab late wickets to take command aga...
28602    Pakistan women take on SA in T20 series commen...
Name: News, Length: 28603, dtype: object

In [221]:
news_data.tail()

Unnamed: 0,Date,News,Polarity
28598,28 01 2021,Pakistan to stop manual visas from next month,0
28599,28 01 2021,"Pakistan, Kuwait vow to expand ties",0
28600,28 01 2021,‘Digital Pakistan coming to life’,0
28601,28 01 2021,Pakistan grab late wickets to take command aga...,0
28602,28 01 2021,Pakistan women take on SA in T20 series commen...,0


### Import and stock prices as daily difference

In [250]:
diff = pd.read_csv('Companies diff\ABL PA Equity.csv')

In [251]:
diff.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,3951.0,3951.0,3951.0,3951.0,3951.0
mean,0.02061,0.02073,0.019534,0.020362,10.30979
std,1.031389,0.972631,0.901209,0.861705,1192969.0
min,-10.4344,-7.8013,-6.19,-5.9452,-41244280.0
25%,-0.3021,-0.2935,-0.26955,-0.2826,-74850.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.3344,0.32595,0.3122,0.29235,66292.0
max,7.9964,6.32,8.0841,5.5877,43250890.0


In [252]:
diff['Date'] = pd.to_datetime(diff['Date']) #Convert date to datetime

In [253]:
diff.head()

Unnamed: 0,Open,High,Low,Close,Volume,Date
0,0.0,0.0,0.0,0.0,0.0,2005-08-18
1,4.798,-0.9625,0.9255,0.0,-21684.0,2005-08-19
2,0.0,0.148,0.1481,0.0889,57398.0,2005-08-22
3,0.0889,-0.074,-0.2073,-0.0592,-31888.0,2005-08-23
4,-0.0592,-0.0074,0.0963,-0.1481,-8928.0,2005-08-24


### Make relevant functions to map polarities and take into account weekends

In [257]:
def pol(val):
    #print(val)
    if val >= 0:
        
        return 1
    
    else:
        
        return 0
    


# Year, month, day	
def check(x):
    d3 = datetime.datetime.strptime(x, '%d %m %Y')
    d2 = datetime.datetime.strptime(x, '%d %m %Y').weekday()
   # print(d3)
    ##print(d2)

    if (d2 == 5):
        d1 = d3 - datetime.timedelta(days = 1)
    elif (d2 == 6):
        d1 = d3 - datetime.timedelta(days = 2)
    else:
        d1 = d3
    #print(d1)    
    return d1


check('08 02 2009')

datetime.datetime(2009, 2, 6, 0, 0)

### Label data with polarity

In [258]:
for i in range(28000):
    #print(i)
    #print(news_data['Date'][i])
    date = check(news_data['Date'][i])
    #print(date)
    index = diff.index[diff['Date'] == date]
    #print(index)
    #print(int(diff['Close'][index]))
    polarity = pol(float(diff['Close'][index]))
    news_data['Polarity'][i] = polarity
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [259]:
news_data.describe()

Unnamed: 0,Polarity
count,28603.0
mean,0.532182
std,0.498972
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


### Save labelled data

In [261]:
news_data.to_csv(path_or_buf = 'E:/Fall 2020/Kaavish/suffwan randi' + '.csv', index = False) #Save dataframe as CSV
    