## Finding Location from textual data

In [3]:
# Libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline

## Read in the csv files
Read in the combined csv file.

In [4]:
# Read in data from reliefweb AFTER 2006
# Notice: after the csv name there is an additional argument 
    
df= pd.read_csv("./combined.csv")

In [5]:
# check
df.head()

Unnamed: 0,date,source,text
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,flood-hit farmers to get 9k-quintal wheat seed...
1,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,debt relief likely for flood-hit farmers. ruch...
2,2019-10-14,https://www.tribuneindia.com/news/punjab/no-wh...,"no wheat seed disbursal, farmers livid. aparna..."
3,2019-10-14,https://www.tribuneindia.com/news/punjab/farme...,farmers in flood-affected areas to get free wh...
4,2019-10-11,https://www.tribuneindia.com/news/punjab/busin...,business sinks in mandis of flood-hit lohian. ...


In [6]:
# Import Districts
location= pd.read_csv("./latlongpunjab.csv")
location = location['Location']

In [7]:
location = location.str.lower()

In [8]:
#adding some replacements based on more commonly used place names
# fatehgarh sahib = fatehgarh
# sbs nagar = nawanshahr, sbs
# sas nagar = mohali, nagar
# sas nagar = ajitgarh, nagar 
# sri muktsar sahib = muktsar
added_names = pd.Series(['fatehgarh','nawanshahr','mohali','ajitgarh','muktsar','nagar','sbs'])

In [9]:
location = location.append(added_names)

# Finding Location in the Text & URLs

In [10]:
# Write a function to pass in the list
def location_fcn(lst, df):
    for i in lst:
        df[i] = df['source'].str.count(i) + df['text'].str.count(i)

In [11]:
location_fcn(location, df)

In [12]:
df.sum()

date                 2019-10-212019-10-162019-10-142019-10-142019-1...
source               https://www.tribuneindia.com/news/punjab/flood...
text                 flood-hit farmers to get 9k-quintal wheat seed...
amritsar                                                           223
barnala                                                             40
bathinda                                                           199
faridkot                                                            66
fatehgarh sahib                                                     82
fazilka                                                            185
ferozepur                                                          322
gurdaspur                                                          206
hoshiarpur                                                          74
jalandhar                                                          564
kapurthala                                                         227
ludhia

In [13]:
#combining columns for the extra names back to the original name
# fatehgarh sahib = fatehgarh
# sbs nagar = nawanshahr, sbs
# sas nagar = mohali, nagar
# sas nagar = ajitgarh, nagar 
# sri muktsar sahib = muktsar
df['fatehgarh sahib'] = df['fatehgarh sahib'] + df['fatehgarh']
df['sbs nagar'] = df['sbs nagar'] + df['nawanshahr']+ df['sbs']
df['sas nagar'] = df['sas nagar'] + df['mohali']+ df['nagar']+ df['ajitgarh']
df['sri muktsar sahib'] = df['sri muktsar sahib'] + df['muktsar']

In [14]:
df = df.drop(['muktsar','mohali','nagar','ajitgarh','nawanshahr','sbs','fatehgarh'], axis = 1)

In [15]:
df.sum()

date                 2019-10-212019-10-162019-10-142019-10-142019-1...
source               https://www.tribuneindia.com/news/punjab/flood...
text                 flood-hit farmers to get 9k-quintal wheat seed...
amritsar                                                           223
barnala                                                             40
bathinda                                                           199
faridkot                                                            66
fatehgarh sahib                                                    187
fazilka                                                            185
ferozepur                                                          322
gurdaspur                                                          206
hoshiarpur                                                          74
jalandhar                                                          564
kapurthala                                                         227
ludhia

# Finding People, Crops, Homes, Money in the Text & URLs

In [16]:
# Finding People Affected
def people_fcn(lst, df):
    sum_people = 0
    for i in lst:
        sum_people += df['source'].str.count(i) + df['text'].str.count(i)
    df['people'] = sum_people

In [17]:
peeps = ['death', 'die', 'dead', 'people','resident','residents',
                 'kill', 'evacuat', 'mortality','villager','villagers',
                 'victim', 'suffer', 'rescue', 'toll', 'displace','displaced',
                 'missing', 'drown', 'wound', 'survive','persons','family','families',
                 'suffer', 'displace', 'strand', 'displace','person']

In [18]:
people_fcn(peeps, df)

In [19]:
df['people'].value_counts()

0     365
1     270
2     165
3     104
4     101
5      82
6      49
7      48
8      43
10     38
9      28
12     24
11     23
17     15
19     13
15     12
14     11
13      9
18      8
24      7
23      7
16      7
22      6
21      6
33      5
31      3
28      3
20      2
25      2
29      2
30      2
34      2
49      1
26      1
27      1
32      1
37      1
38      1
43      1
46      1
47      1
51      1
Name: people, dtype: int64

In [20]:
#Finding Crops / Livestock Affected
crop_livestock = ['damag', 'loss', 'lost','kill','farm','field','agriculture','stable','horse','horses',
                 'destroy','crop', 'livestock', 'cattle','farmer','agricultural','land','lands','fields',
                  'farmers','acre','acres','grain','basmati','paddy',
                 'cow','sheep','pig','cows','pigs','corn','wheat','seed','soy','rice', 'crops']

In [21]:
# Finding Crops / Livestock Affected
def crops_fcn(lst, df):
    sum_crops = 0
    for i in lst:
        sum_crops += df['source'].str.count(i) + df['text'].str.count(i)
    df['crops'] = sum_crops


In [22]:
crops_fcn(crop_livestock, df)

In [23]:
df['crops'].value_counts()

0      322
2      163
1      134
3      122
4      110
      ... 
49       1
48       1
41       1
39       1
127      1
Name: crops, Length: 70, dtype: int64

In [24]:
#Finding Houses Affected
house = ['damag', 'loss', 'lost', 'evacuat','houses','submerged','rooftop','rooftops','submerge',
                 'affect', 'relief', 'relieve', 'destroy', 'dwelling','structural','village','villages',
                 'house', 'roof', 'home', 'window','damage', 'structure', 'barn','hut']

In [25]:
# Finding Homes Affected
def home_fcn(lst, df):
    sum_homes = 0
    for i in lst:
        sum_homes += df['source'].str.count(i) + df['text'].str.count(i)
    df['homes'] = sum_homes

In [26]:
home_fcn(house, df)

In [27]:
df['homes'].value_counts()

1     384
2     171
3      98
4      78
5      65
     ... 
51      1
50      1
89      1
48      1
91      1
Name: homes, Length: 62, dtype: int64

In [28]:
#Finding the Money
money = ['cost','money','dollar','rupee','economic','economy','value','pounds','euro','compensation', 'rupees','dollars','euros']

In [29]:
# Finding the Money
def money_fcn(lst, df):
    sum_money = 0
    for i in lst:
        sum_money += df['source'].str.count(i) + df['text'].str.count(i)
    df['money'] = sum_money

In [30]:
money_fcn(money, df)

In [31]:
df['money'].value_counts()

0     1177
1      140
2       76
3       35
4       21
6        8
5        5
7        4
11       2
8        2
32       1
12       1
Name: money, dtype: int64

In [32]:
df.sum()

date                 2019-10-212019-10-162019-10-142019-10-142019-1...
source               https://www.tribuneindia.com/news/punjab/flood...
text                 flood-hit farmers to get 9k-quintal wheat seed...
amritsar                                                           223
barnala                                                             40
bathinda                                                           199
faridkot                                                            66
fatehgarh sahib                                                    187
fazilka                                                            185
ferozepur                                                          322
gurdaspur                                                          206
hoshiarpur                                                          74
jalandhar                                                          564
kapurthala                                                         227
ludhia

In [33]:
import pyparsing as pp
import calendar

months_list= []
for month_idx in range(1, 13):
    months_list.append(calendar.month_name[month_idx])
    months_list.append(calendar.month_abbr[month_idx])

months_list = [x.lower() for x in months_list]
# join the list to use it as pyparsing keyword
month_keywords = " ".join(months_list)

# date separator - can be one of '/', '.','-', or ' '
separator = pp.Word("/.,- ")

# Dictionary for numeric date e.g. 12/12/2018
numeric_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=4))

# Dictionary for text date e.g. 12/Jan/2018
text_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.oneOf(month_keywords) + separator + pp.Word(pp.nums, max=4))
#jan 12, 2018
text_date2 = pp.Combine(pp.oneOf(month_keywords) + separator + pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=4))

#jan 12
text_date3 = pp.Combine(pp.oneOf(month_keywords) + separator + pp.Word(pp.nums, max=2))

#12 jan
text_date4 = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.oneOf(month_keywords))

# Finding any type of date - except numeric
date_patterns = [text_date, text_date2, text_date3, text_date4]


In [165]:
#Finding the first date in the text - START HERE CAPTURING DATE
df_dates = {'index_key': [],
            'event_date': []}
date_list = []
# for j in range(0,len(df)):  
for j in range(0,len(df)): 
    for i, date_pattern in enumerate(date_patterns):
        pattern = pp.Suppress(pp.SkipTo(date_pattern)) + date_pattern
        try:
            result = pattern.parseString(df['text'][j])
            df_dates['index_key'].append(j)
            df_dates['event_date'].append(result[0])
#             print(j, result)
        except:
            pass
            

In [166]:
event_dates = pd.DataFrame(df_dates)

In [180]:
# Merging event dates with the dataframe
df3 = pd.merge(df, event_dates, right_on =  "index_key", left_index = True, how="left")

In [189]:
df3['event_date'].fillna("None Found", inplace=True)

In [191]:
#Dropping index key row - not needed now that we've merged
df3.drop("index_key", axis=1, inplace=True)

In [238]:
#Exporting to CSV
df3.to_csv("./location_topic_counts_events.csv", index = False)