# Data Preprocessing

### Import relevant libraries

In [1]:
import pandas as pd
import datetime

### Import and build news data

In [2]:
news_data = pd.read_csv('BR_News.txt', names = ['Date', 'News'], index_col = 0) #Original raw data file

In [3]:
news_data['Polarity'] = [0 for i in range(len(news_data))] #Initialize polarities to zero

In [4]:
news_data.drop_duplicates(subset ="News", 
                     keep = False, inplace = True) 
  

In [5]:
news_data.head()

Unnamed: 0,Date,News,Polarity
,date,news,0
0.0,2011-04-02,Political parties should work for resolution o...,0
1.0,2011-04-05,"China to invest in hydel, renewable energy pow...",0
2.0,2011-04-06,Govt mulling to empower eight public sector en...,0
3.0,2011-04-06,"China offers financial, technical assistance f...",0


In [6]:
news_data.info()

news_data = news_data.iloc[1:]
news_data.head()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 57997 entries, nan to 58988.0
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      57997 non-null  object
 1   News      57997 non-null  object
 2   Polarity  57997 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.8+ MB


Unnamed: 0,Date,News,Polarity
0.0,2011-04-02,Political parties should work for resolution o...,0
1.0,2011-04-05,"China to invest in hydel, renewable energy pow...",0
2.0,2011-04-06,Govt mulling to empower eight public sector en...,0
3.0,2011-04-06,"China offers financial, technical assistance f...",0
4.0,2011-04-06,Shipping activity at Port Qasim - KARACHI: Shi...,0


In [7]:
news_data['Date'] = pd.to_datetime(news_data['Date']) #Convert to datetime
news_data['Date'] = news_data["Date"].dt.strftime("%d %m %Y") #Format date
news_data['News'].map(str) #Convert news to string

0.0        Political parties should work for resolution o...
1.0        China to invest in hydel, renewable energy pow...
2.0        Govt mulling to empower eight public sector en...
3.0        China offers financial, technical assistance f...
4.0        Shipping activity at Port Qasim - KARACHI: Shi...
                                 ...                        
58984.0    90pc criminals acquitted due to faulty investi...
58985.0    Regulatory bodies: AGP to begin audit decision...
58986.0    Northern South Asian region: Goethe-Institut h...
58987.0    Buzdar making efforts to woo opposition MPAs a...
58988.0    DS inspects KCR track - KARACHI: The newly app...
Name: News, Length: 57996, dtype: object

In [8]:
news_data.tail()


Unnamed: 0,Date,News,Polarity
58984.0,29 01 2021,90pc criminals acquitted due to faulty investi...,0
58985.0,29 01 2021,Regulatory bodies: AGP to begin audit decision...,0
58986.0,29 01 2021,Northern South Asian region: Goethe-Institut h...,0
58987.0,29 01 2021,Buzdar making efforts to woo opposition MPAs a...,0
58988.0,29 01 2021,DS inspects KCR track - KARACHI: The newly app...,0


In [9]:
news_data.drop_duplicates(subset = 'News', keep = False, inplace = True)


In [10]:
news_data.head()

Unnamed: 0,Date,News,Polarity
0.0,02 04 2011,Political parties should work for resolution o...,0
1.0,05 04 2011,"China to invest in hydel, renewable energy pow...",0
2.0,06 04 2011,Govt mulling to empower eight public sector en...,0
3.0,06 04 2011,"China offers financial, technical assistance f...",0
4.0,06 04 2011,Shipping activity at Port Qasim - KARACHI: Shi...,0


In [11]:
news_data.reset_index(drop = True, inplace = True)

In [12]:
news_data.head()

Unnamed: 0,Date,News,Polarity
0,02 04 2011,Political parties should work for resolution o...,0
1,05 04 2011,"China to invest in hydel, renewable energy pow...",0
2,06 04 2011,Govt mulling to empower eight public sector en...,0
3,06 04 2011,"China offers financial, technical assistance f...",0
4,06 04 2011,Shipping activity at Port Qasim - KARACHI: Shi...,0


### Import and stock prices as daily difference

In [13]:
diff = pd.read_csv('Companies diff\ABOT PA Equity.csv')

In [14]:
diff.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,5157.0,5157.0,5157.0,5157.0,5157.0
mean,0.147057,0.149578,0.145137,0.147993,5.805895
std,8.392596,6.892363,7.040883,6.328886,62090.918356
min,-88.8844,-58.9713,-41.2265,-45.0148,-925733.0
25%,-0.7633,-0.8252,-0.6606,-0.7319,-3408.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.8546,0.8822,0.8114,0.7571,3200.0
max,83.7564,43.7413,52.9888,52.1436,918700.0


In [15]:
diff['Date'] = pd.to_datetime(diff['Date']) #Convert date to datetime

In [16]:
diff.head()

Unnamed: 0,Open,High,Low,Close,Volume,Date
0,0.0,0.0,0.0,0.0,0.0,2001-01-03
1,0.0,0.0,0.0,0.0,0.0,2001-01-04
2,0.0,0.0,0.0,0.0,0.0,2001-01-05
3,0.0,0.0,0.0,0.0,0.0,2001-01-08
4,0.0,0.0,0.0,0.0,0.0,2001-01-09


In [17]:
diff.tail()

Unnamed: 0,Open,High,Low,Close,Volume,Date
5152,0.99,-1.0,0.0,-11.2,250.0,2020-10-02
5153,-6.0,-1.0,-33.0,-3.58,25250.0,2020-10-05
5154,-10.0,-9.0,23.0,-2.12,-22950.0,2020-10-06
5155,19.55,11.0,0.05,13.62,12700.0,2020-10-07
5156,6.45,9.0,16.05,6.78,6600.0,2020-10-08


### Make relevant functions to map polarities and take into account weekends

In [18]:
def pol(val):
    #print(val)
    if val >= 0:
        
        return 1
    
    else:
        
        return 0
    


# Year, month, day	
def check(x):
    d3 = datetime.datetime.strptime(x, '%d %m %Y')
    d2 = datetime.datetime.strptime(x, '%d %m %Y').weekday()
   # print(d3)
    ##print(d2)

    if (d2 == 5):
        d1 = d3 - datetime.timedelta(days = 1)
    elif (d2 == 6):
        d1 = d3 - datetime.timedelta(days = 2)
    else:
        d1 = d3
    #print(d1)    
    return d1


check('08 02 2009')

datetime.datetime(2009, 2, 6, 0, 0)

In [19]:
news_data.head()

Unnamed: 0,Date,News,Polarity
0,02 04 2011,Political parties should work for resolution o...,0
1,05 04 2011,"China to invest in hydel, renewable energy pow...",0
2,06 04 2011,Govt mulling to empower eight public sector en...,0
3,06 04 2011,"China offers financial, technical assistance f...",0
4,06 04 2011,Shipping activity at Port Qasim - KARACHI: Shi...,0


In [20]:
x = len(news_data)
print(x)

57996


In [21]:
print(len(diff))


5157


### Label data with polarity

In [22]:
count = 0
for i in range(len(news_data)):
    #print(i)
    #print(news_data['Date'][i])
    
    date = check(news_data['Date'][i])
    #print(date)
    index = diff.index[diff['Date'] == date]
    #print(type(index))
    print(index)
    #print(diff['Close'][index])
    if (index <= len(diff)):
        count+=1
        polarity = pol(float(diff['Close'][index]))
        #print(polarity)
        news_data['Polarity'][i] = polarity
count

Int64Index([2672], dtype='int64')
Int64Index([2674], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2675], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2676], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([2677], dtype='int64')
Int64Index([26

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_data['Polarity'][i] = polarity


Int64Index([2685], dtype='int64')
Int64Index([2685], dtype='int64')
Int64Index([2685], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2686], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2687], dtype='int64')
Int64Index([2688], dtype='int64')
Int64Index([2688], dtype='int64')
Int64Index([2688], dtype='int64')
Int64Index([2688], dtype='int64')
Int64Index([2689], dtype='int64')
Int64Index([2689], dtype='int64')
Int64Index([26

  if (index <= len(diff)):


Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index

50790

In [23]:
news_data = news_data[0:count]
news_data.describe()

Unnamed: 0,Polarity
count,50790.0
mean,0.557767
std,0.496657
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


### Save labelled data

In [24]:
news_data.to_csv(path_or_buf = 'C:/Users/Aun Electronic/Documents/BR_News' + '.csv', index = False) #Save dataframe as CSV