In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
%matplotlib inline

## Read csv files

In [5]:
# Load the five Tribune *_text CSVs in one pass (same files, same order).
(flood_text,
 flooded_text,
 flooding_text,
 monsoon_text,
 heavy_rain_text) = map(pd.read_csv, (
    '../data/trib_flood_text.csv',
    '../data/trib_flooded_text.csv',
    '../data/trib_flooding_text.csv',
    '../data/trib_monsoon_text.csv',
    '../data/trib_heavy_rain_text.csv',
))

In [6]:
# Load the five Tribune *_url CSVs in one pass (same files, same order).
(flood_url,
 flooded_url,
 flooding_url,
 monsoon_url,
 heavy_rain_url) = map(pd.read_csv, (
    '../data/trib_flood_url.csv',
    '../data/trib_flooded_url.csv',
    '../data/trib_flooding_url.csv',
    '../data/trib_monsoon_url.csv',
    '../data/trib_heavy_rain_url.csv',
))

## Formatting

- Make date column datetime object

- Enter full url into url column 

- Format combined csv into 3 columns: date, url and combined text (text)

- Combine headline and text

### *make date a datetime object*

In [7]:
# The *_url frames, grouped so the helpers below can loop over them.
url_list = [flood_url, flooded_url, flooding_url, monsoon_url, heavy_rain_url]

def to_datetime(df_list):
    """Convert the 'date' column of every frame in ``df_list`` to datetime, in place.

    The raw values are assumed to be strings whose last 10 characters are a
    time stamp; that suffix is sliced off before parsing.

    Frames whose 'date' column is already datetime64 are skipped, so running
    the cell a second time is a no-op instead of an error. Missing values are
    parsed to NaT rather than raising.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames that each hold a 'date' column.

    Returns
    -------
    None
        The frames are modified in place.
    """
    for df in df_list:
        # Already converted (e.g. the cell was re-run): slicing Timestamps would fail.
        if pd.api.types.is_datetime64_any_dtype(df['date']):
            continue

        # removing time stamp (.str slicing is NaN-safe, unlike a lambda slice)
        date_only = df['date'].str[:-10]

        # converting to datetime object
        df['date'] = pd.to_datetime(date_only)

# Converts the 'date' column of every frame in url_list in place.
to_datetime(url_list)

### *check to see if date changed to datetime*

In [8]:
def datatype(df_list):
    """Print the ``DataFrame.info()`` summary of each frame in ``df_list``.

    A blank line is printed after every summary to separate the frames.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame

    Returns
    -------
    None
    """
    for df in df_list:
        # info() writes its report to stdout itself and returns None, so
        # wrapping it in print() would add a stray "None" line.
        df.info()
        print('')
        
# Verify that 'date' now has a datetime64 dtype in every url frame.
datatype(url_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 4 columns):
headline    301 non-null object
date        301 non-null datetime64[ns]
caption     301 non-null object
url         301 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 9.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 4 columns):
headline    232 non-null object
date        232 non-null datetime64[ns]
caption     232 non-null object
url         232 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 7.4+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 4 columns):
headline    120 non-null object
date        120 non-null datetime64[ns]
caption     120 non-null object
url         120 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.9+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns 

### *add full url to url column*

In [9]:
# Site root that is prepended to the relative article paths in the 'url' column.
TRIBUNE_BASE_URL = 'https://www.tribuneindia.com'

# One loop instead of five copy-pasted lines. The startswith() guard keeps the
# cell idempotent: re-running it does not produce a doubled prefix.
for url_df in (flood_url, flooded_url, flooding_url, monsoon_url, heavy_rain_url):
    url_df['url'] = url_df['url'].map(
        lambda path: path if path.startswith(TRIBUNE_BASE_URL) else TRIBUNE_BASE_URL + path
    )

### *scale down each df: keep only the date, url and headline columns*

In [10]:
# Keep only these columns, in this order; every other column is dropped.
kept_columns = ['date', 'url', 'headline']
flood_url, flooded_url, flooding_url, monsoon_url, heavy_rain_url = [
    frame[kept_columns]
    for frame in (flood_url, flooded_url, flooding_url, monsoon_url, heavy_rain_url)
]

In [11]:
# Preview the first rows to confirm the column selection and the full urls.
flood_url.head()

Unnamed: 0,date,url,headline
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,Flood-hit farmers to get 9k-quintal wheat seed
1,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,Debt relief likely for flood-hit farmers
2,2019-10-14,https://www.tribuneindia.com/news/punjab/no-wh...,"No wheat seed disbursal, farmers livid"
3,2019-10-14,https://www.tribuneindia.com/news/punjab/farme...,Farmers in flood-affected areas to get free wh...
4,2019-10-11,https://www.tribuneindia.com/news/punjab/busin...,Business sinks in mandis of flood-hit Lohian


### *Combine headline from df_url with text from df_text*

In [9]:
# DataFrame.join aligns on the index, so row i of each *_url frame is paired
# with row i of the matching *_text frame.
flood, flooded, flooding, monsoon, heavy_rain = (
    url_frame.join(text_frame)
    for url_frame, text_frame in (
        (flood_url, flood_text),
        (flooded_url, flooded_text),
        (flooding_url, flooding_text),
        (monsoon_url, monsoon_text),
        (heavy_rain_url, heavy_rain_text),
    )
)

In [10]:
# The joined frames, grouped so the helpers can loop over them.
df_list = [flood, flooded, flooding, monsoon, heavy_rain]

# Confirm every joined frame has the date, url, headline and text columns.
datatype(df_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 4 columns):
date        301 non-null datetime64[ns]
url         301 non-null object
headline    301 non-null object
text        301 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 9.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 4 columns):
date        232 non-null datetime64[ns]
url         232 non-null object
headline    232 non-null object
text        232 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 7.4+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 4 columns):
date        120 non-null datetime64[ns]
url         120 non-null object
headline    120 non-null object
text        120 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.9+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns 

In [11]:
df_list = [flood, flooded, flooding, monsoon, heavy_rain]

for df in df_list:
    # Guard so the cell can be re-run: once 'headline' has been merged into
    # 'text' and dropped, a second pass would raise KeyError.
    if 'headline' not in df.columns:
        continue

    # Prefix the article text with its headline. A row whose text is missing
    # stays NaN here (str + NaN -> NaN).
    df['text'] = df['headline'] + '. ' + df['text']
    df.drop(columns=['headline'], inplace=True)

## Check null values

In [12]:
def null_vals(df_list):
    """Print the per-column count of missing values for each frame in ``df_list``.

    Every report is followed by one blank line.
    """
    for frame in df_list:
        missing_per_column = frame.isnull().sum()
        # sep='\n' with an empty second argument emits the blank separator line.
        print(missing_per_column, '', sep='\n')
        
# Report missing values per column for each combined frame.
null_vals(df_list)

date    0
url     0
text    0
dtype: int64

date    0
url     0
text    0
dtype: int64

date    0
url     0
text    0
dtype: int64

date    0
url     0
text    1
dtype: int64

date    0
url     0
text    0
dtype: int64



### *drop null value in monsoon*

In [13]:
# Find the index label(s) of the rows in monsoon whose 'text' is null.
monsoon[monsoon['text'].isnull()].index.tolist()

[25]

In [14]:
# Display the full row(s) with null 'text' before dropping them.
monsoon.loc[monsoon[monsoon['text'].isnull()].index]

Unnamed: 0,date,url,text
25,2019-08-08,https://www.tribuneindia.com/news/punjab/facin...,


In [15]:
# Shape before the drop, for comparison with the shape afterwards.
monsoon.shape

(323, 3)

In [16]:
# Drop by condition rather than by a hard-coded row label, so the cell still
# removes the right row(s) if the scraped data changes.
# In place on purpose: df_list holds a reference to this same frame.
monsoon.dropna(subset=['text'], inplace=True)

In [17]:
# Shape after the drop; the row count should have gone down.
monsoon.shape

(322, 3)

### *check datatypes*

In [18]:
# Re-check dtypes and non-null counts after merging the headline and dropping nulls.
datatype(df_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 3 columns):
date    301 non-null datetime64[ns]
url     301 non-null object
text    301 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 7.2+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 3 columns):
date    232 non-null datetime64[ns]
url     232 non-null object
text    232 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 5.6+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
date    120 non-null datetime64[ns]
url     120 non-null object
text    120 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 2.9+ KB
None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 0 to 322
Data columns (total 3 columns):
date    322 non-null datetime64[ns]
url     322 non-null object
text    322 non-null object
dtypes: datetime64[ns

## Clean text 

- lower()

- replace line breaks with spaces

- keep punctuation -- not done

In [30]:
#adapted from Dominika Twardowski

def text_processing(df):
    """Normalise the 'text' column of ``df`` in place.

    Runs of carriage returns / line feeds are collapsed into a single space
    and the text is lower-cased. Missing values pass through as missing
    instead of raising, because the vectorised ``.str`` methods skip them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'text' column; modified in place.

    Returns
    -------
    None
    """
    # change line breaks into a space
    without_breaks = df['text'].str.replace(r"[\r\n]+", " ", regex=True)

    # change text to lower case
    df['text'] = without_breaks.str.lower()
    

In [31]:
# Clean the 'text' column of every combined frame (each frame is modified in place).
for df in df_list: 
    text_processing(df)

In [36]:
# Spot-check the processed text of one frame.
heavy_rain.text

0      rain damages crops. our correspondent fazilka,...
1      six pilgrims from punjab killed in uttarakhand...
2      storage in dams better than 10-yr average. vij...
3      amid deluge, jalandhar welcomes ‘flood babies’...
4      bhakra water level to be reduced by 5 ft. vija...
                             ...                        
113    rain-hit villagers protest govt apathy in aboh...
114    woman dies in roof collapse. our correspondent...
115    wheat crop in moga flattened. kulwinder sandhu...
116    rain, high-speed winds damage crops in malwa. ...
117    heavy rain lashes chandigarh, parts of punjab,...
Name: text, Length: 118, dtype: object

## Combine dataframes

In [38]:
# Stack all keyword frames into one; ignore_index=True gives the result a fresh 0..n-1 index.
tribune_data = pd.concat(df_list, ignore_index=True)

In [39]:
# Confirm the combined frame has no missing values before saving.
tribune_data.isnull().sum()

date    0
url     0
text    0
dtype: int64

In [41]:
# Rename the 'url' column to 'source'; rebinding the name instead of using inplace.
tribune_data = tribune_data.rename(columns={'url': 'source'})

In [42]:
# Write the cleaned, combined data to disk; index=False leaves the row index out of the file.
tribune_data.to_csv('../data/tribune_data_clean.csv', index=False)