# Analysis of Text 

**Purpose:** In this notebook the individual keyword searches are combined into one dataframe with an additional column which is related to what keyword search the row was created from. 


In [1]:
# Libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline

In [2]:
# Read in the new csv file
# Notice: additional  parse_dates argument     
combined_df= pd.read_csv("../data/combined.csv",
                            parse_dates = ["date"])

In [3]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 4 columns):
index     1225 non-null int64
date      1225 non-null datetime64[ns]
source    1225 non-null object
text      1225 non-null object
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 38.4+ KB


In [4]:
# check if have any duplicated articles 
dups = combined_df["text"]
combined_df[dups.isin(dups[dups.duplicated()])].sort_values("text")

Unnamed: 0,index,date,source,text


In [5]:
# function to make dataframe from search_keywords
def imp_sentences(dataframe, search_keywords):

    sentences_list = []
    
    numeric_words = [" zero","one ", " two", " three", " four", " five", " six", 
                 "seven", "eight", "nine", " ten", "eleven", "twelve", "thirteen", "fourteen",
                 "fifteen", "sixteen", "seventeen", "eighteen", "nineteen","twenty",
                 "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
                 "hundred", "thousand", "million", "billion", "trillion"]


    # function for finding string with a number in it
    def hasnumber(inputsentence):
        return bool(re.search(r'\d', inputsentence))
    
    # function for finding string with numeric word in it
    def hasnumword(inputsentence):
        has_numericword = False
        for keyword in numeric_words:
            if len(re.findall(keyword,inputsentence))>0:
                has_numericword = True
        return has_numericword

    # loop through the entire dataframe
    for idx in range(0,dataframe.shape[0]):
        row = dataframe.iloc[idx,:]

        # split the row on ". " 
        # sentences = row["text"].split(". " )
        sentences = re.split(r'\.\D', row["text"])
        date = row["date"]
        source = row["source"]

        # for each sentence of the row
        for sentence in sentences:
            has_keyword = False
            for keyword in search_keywords:
                #if sentence has keyword:
                # length will be greater than zero
                if len(re.findall(keyword, sentence))>0:
                     #set has_keyword to True
                    has_keyword=True
                     #break out of the loop
                    break

            number_bool = hasnumber(sentence) 
            has_numericword = hasnumword(sentence)

            # if the sentence has at least one of the keywords
            if (has_keyword == True and number_bool == True) or (has_keyword == True and has_numericword == True) :
                sentence_info = {}
                sentence_info["date"] = date
                sentence_info["source"] = source
                sentence_info['sentence'] = sentence
                # add the sentence and the date to the sentence list
                sentences_list.append(sentence_info)  
                
    search_keywords_df =  pd.DataFrame(sentences_list)
    return search_keywords_df

In [6]:
# keywords

total_keywords= ['crop', 'livestock', 'cattle', "farm", 'wheat', 'rice', 'barley',
                 'agricult', 'acre',
                'death', ' die', 'dead','kill', 'drown', "mortality",
                'victim', 'suffer', 'rescue', 'missing', 'evacuat', 'help', 'affect',
                'relief', 'wound', 'survive', 'displace', 'strand','lost', 'damag',
                'destroy', 'econom', 'cost', 'rehabilit'
                'house', 'home', 'roof', 'street', 'road', 'bridge']
# find total search_keywords
imp_sentences(combined_df, total_keywords)

Unnamed: 0,date,source,sentence
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,flood-hit farmers to get 9k-quintal wheat seed
1,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"jalandhar, october 20 in a major initiative a..."
2,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said in total, 25,000 quintals of wheat see..."
3,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said of this, nearly 9,000 quintals of seed..."
4,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,chief agriculture officer naazar singh said ...
...,...,...,...
3394,2007-07-17,https://reliefweb.int/node/244592,- 8239 people are staying in 93 relief centres
3395,2007-07-17,https://reliefweb.int/node/244592,"- due to landslide in darjeeling, the national..."
3396,2007-07-17,https://reliefweb.int/node/244592,- about 105378 people are still staying in 373...
3397,2007-07-17,https://reliefweb.int/node/244592,loss of human lives -as per provisional inform...


In [7]:
total_keyword_one_search_df= imp_sentences(combined_df, total_keywords)

In [8]:
total_keyword_one_search_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3399 entries, 0 to 3398
Data columns (total 3 columns):
date        3399 non-null datetime64[ns]
source      3399 non-null object
sentence    3399 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 79.8+ KB


In [9]:
# Find duplicated sentences based on duplicated text  in the total_keyword search
dups = total_keyword_one_search_df["sentence"]
total_keyword_one_search_df[dups.isin(dups[dups.duplicated()])].sort_values("sentence")

Unnamed: 0,date,source,sentence
2336,2010-07-12,https://reliefweb.int/node/361003,- 05 teams of 8th bn ndrf consisting of 206 p...
2328,2010-07-13,https://reliefweb.int/node/360959,- 05 teams of 8th bn ndrf consisting of 206 p...
3371,2007-08-17,https://reliefweb.int/node/240777,- 1437 villages in 19 districts with populati...
3343,2007-08-18,https://reliefweb.int/node/240663,- 1437 villages in 19 districts with populati...
2556,2009-07-29,https://reliefweb.int/node/318915,- 19454 livestock have been perished so far
...,...,...,...
2150,2014-09-15,https://reliefweb.int/node/691554,thirty generator sets of 3 to 5 kva capacity h...
2147,2014-09-15,https://reliefweb.int/node/691734,"to restore the road connectivity, five task fo..."
2161,2014-09-15,https://reliefweb.int/node/691554,"to restore the road connectivity, five task fo..."
1938,2013-08-06,https://www.dainiktribuneonline.com/2013/08/वर...,udhiana 6 august nine torrential rains for mor...


In [10]:
#keywords
agri_keywords=['crop', 'livestock', 'cattle', "farm", 'wheat', 'rice', 'barley',
              'agricult', 'acre'
              ]

# find agriculture data 
imp_sentences(combined_df, agri_keywords)

Unnamed: 0,date,source,sentence
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,flood-hit farmers to get 9k-quintal wheat seed
1,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"jalandhar, october 20 in a major initiative a..."
2,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said in total, 25,000 quintals of wheat see..."
3,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said of this, nearly 9,000 quintals of seed..."
4,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,chief agriculture officer naazar singh said ...
...,...,...,...
1111,2008-08-15,https://reliefweb.int/node/277055,- 344 cattle/livestock are reported to have be...
1112,2008-06-16,https://reliefweb.int/node/270161,"india: assam reels under floods, orissa sounds..."
1113,2007-08-18,https://reliefweb.int/node/240663,- 341 cattle camps continue to run in the affe...
1114,2007-08-17,https://reliefweb.int/node/240777,- several low lying areas and agricultural fi...


In [11]:
agri_df = imp_sentences(combined_df, agri_keywords)

In [12]:
# add column named to specify what type it is 
agri_df["topic"] = "agri"

# check
agri_df.head()

Unnamed: 0,date,source,sentence,topic
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,flood-hit farmers to get 9k-quintal wheat seed,agri
1,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"jalandhar, october 20 in a major initiative a...",agri
2,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said in total, 25,000 quintals of wheat see...",agri
3,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said of this, nearly 9,000 quintals of seed...",agri
4,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,chief agriculture officer naazar singh said ...,agri


In [13]:
# keywords 
death_keywords=['death', ' die', 'dead','kill', 'drown', "mortality"]

# find sentences related to death
imp_sentences(combined_df, death_keywords)

Unnamed: 0,date,source,sentence
0,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"ush({ }); in kapurthala, crop losses have bee..."
1,2019-09-12,https://www.tribuneindia.com/news/punjab/punja...,"initial assessment 6 persons died 1,400 inju..."
2,2019-09-03,https://www.tribuneindia.com/news/punjab/rs-4-...,"our correspondent fazilka, september 2 family..."
3,2019-09-02,https://www.tribuneindia.com/news/punjab/power...,"as many as 25,000 personnel from different de..."
4,2019-08-27,https://www.tribuneindia.com/news/punjab/distr...,in his desperate call to the pcr at 6.57 pm on...
...,...,...,...
411,2007-08-18,https://reliefweb.int/node/240663,- 06 human deaths have been reported in the s...
412,2007-08-18,https://reliefweb.int/node/240663,loss of human lives - as per provisional infor...
413,2007-08-17,https://reliefweb.int/node/240777,"out of the missing, dead bodies of only 08 per..."
414,2007-08-17,https://reliefweb.int/node/240777,loss of human lives - as per provisional infor...


In [14]:
# create dataframe 
death_df = imp_sentences(combined_df, death_keywords)

# add column named to specify what type it is 
death_df["topic"] = "death"

# check
death_df.head()

Unnamed: 0,date,source,sentence,topic
0,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"ush({ }); in kapurthala, crop losses have bee...",death
1,2019-09-12,https://www.tribuneindia.com/news/punjab/punja...,"initial assessment 6 persons died 1,400 inju...",death
2,2019-09-03,https://www.tribuneindia.com/news/punjab/rs-4-...,"our correspondent fazilka, september 2 family...",death
3,2019-09-02,https://www.tribuneindia.com/news/punjab/power...,"as many as 25,000 personnel from different de...",death
4,2019-08-27,https://www.tribuneindia.com/news/punjab/distr...,in his desperate call to the pcr at 6.57 pm on...,death


In [15]:
death_df.loc[100].sentence

' a few hours after the death of her husband and two daughters, the woman gave birth to twins at a private hospital'

In [16]:
# keywords
lives_keywords = ['victim', 'suffer', 'rescue', 'missing', 'evacuat', 'help', 'affect',
                  'relief', 'wound', 'survive', 'displace', 'strand','lost', 'damag',
                  'destroy', 'econom', 'cost', 'rehabilit',
                 ]

# find the lives data 
imp_sentences(combined_df, lives_keywords)

Unnamed: 0,date,source,sentence
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"jalandhar, october 20 in a major initiative a..."
1,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said in total, 25,000 quintals of wheat see..."
2,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said of this, nearly 9,000 quintals of seed..."
3,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,"according to a proposal, farmers in 14 distri..."
4,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,ush({ }); according to estimates prepared by ...
...,...,...,...
1959,2007-07-17,https://reliefweb.int/node/244592,- 8239 people are staying in 93 relief centres
1960,2007-07-17,https://reliefweb.int/node/244592,"- due to landslide in darjeeling, the national..."
1961,2007-07-17,https://reliefweb.int/node/244592,- about 105378 people are still staying in 373...
1962,2007-07-17,https://reliefweb.int/node/244592,loss of human lives -as per provisional inform...


In [17]:
# create dataframe 
lives_df = imp_sentences(combined_df, lives_keywords)

# add column named to specify what type it is 
lives_df["topic"] = "lives"

# check
lives_df.head()

Unnamed: 0,date,source,sentence,topic
0,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"jalandhar, october 20 in a major initiative a...",lives
1,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said in total, 25,000 quintals of wheat see...",lives
2,2019-10-21,https://www.tribuneindia.com/news/punjab/flood...,"he said of this, nearly 9,000 quintals of seed...",lives
3,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,"according to a proposal, farmers in 14 distri...",lives
4,2019-10-16,https://www.tribuneindia.com/news/punjab/debt-...,ush({ }); according to estimates prepared by ...,lives


In [18]:
lives_df.loc[30].sentence

' additionally, the dc sought rs 4 lakh for gratuitous loss due to the floods, rs 30 lakh for search, rescue and temporary shelters in flood-affected villages, rs 80 lakh for temporary accommodation, food and water and rs 15 lakh for clearing attached areas'

In [19]:
# keywords house/infastructure
house_keywords = [ 'house', 'home', 'roof', 'street', 'road', 'bridge', ]

imp_sentences(combined_df, house_keywords)

Unnamed: 0,date,source,sentence
0,2019-09-19,https://www.tribuneindia.com/news/punjab/flood...,"similarly, the government will give a compensa..."
1,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"in jalandhar, the divisional commissioner and..."
2,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"ush({ }); in kapurthala, crop losses have bee..."
3,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,telling numbers total loss rs 526 crore to...
4,2019-09-13,https://www.tribuneindia.com/news/punjab/state...,"tribune news service chandigarh/ropar, sept 12..."
...,...,...,...
754,2007-08-18,https://reliefweb.int/node/240663,- 01 state and 36 panchayat roads were damage...
755,2007-08-17,https://reliefweb.int/node/240777,about 52 persons are reported to have been was...
756,2007-08-17,https://reliefweb.int/node/240777,- 01 state and 36 panchayat roads continue to...
757,2007-07-17,https://reliefweb.int/node/244592,- 26 panchayat and 06 state roads which were d...


In [20]:
# create dataframe 
houses_df = imp_sentences(combined_df, house_keywords)

# add column named to specify what type it is 
houses_df["topic"] = "house"

# check
houses_df.head()

Unnamed: 0,date,source,sentence,topic
0,2019-09-19,https://www.tribuneindia.com/news/punjab/flood...,"similarly, the government will give a compensa...",house
1,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"in jalandhar, the divisional commissioner and...",house
2,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"ush({ }); in kapurthala, crop losses have bee...",house
3,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,telling numbers total loss rs 526 crore to...,house
4,2019-09-13,https://www.tribuneindia.com/news/punjab/state...,"tribune news service chandigarh/ropar, sept 12...",house


In [21]:
houses_df.loc[5].sentence

' according to a report submitted by the district administration, 46,300 residents in 182 villages were affected while standing crop on 11,580 acres and 313 houses were damaged completely'

### combine the dataframe topics together into one dataframe 

In [22]:
# list of all the dataframes which will be combined together
dataframes = [death_df, lives_df, agri_df, houses_df]

# combined dataframe
keyword_concat_df = pd.concat(dataframes, ignore_index=True)

In [23]:
keyword_concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4255 entries, 0 to 4254
Data columns (total 4 columns):
date        4255 non-null datetime64[ns]
source      4255 non-null object
sentence    4255 non-null object
topic       4255 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 133.1+ KB


In [24]:
total_keyword_one_search_df.info()

## combining all the keyword lists into separate areas adds on a little over 1000 rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3399 entries, 0 to 3398
Data columns (total 3 columns):
date        3399 non-null datetime64[ns]
source      3399 non-null object
sentence    3399 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 79.8+ KB


In [25]:
total_keyword_one_search_df.loc[20].sentence

'tribune news service  moga, october 13 the state government will provide high-quality wheat crop seeds to flood-hit farmers in moga, jalandhar, kapurthala, ferozepur, fazilka and other flood-affected districts for the upcoming rabi season'

### find duplicated rows in the combined dataframe 

In [26]:
keyword_concat_df[keyword_concat_df.duplicated(keep = False)].sort_values("source")

Unnamed: 0,date,source,sentence,topic
399,2008-08-09,https://reliefweb.int/node/275899,- no human death has been reported in the stat...,death
396,2008-08-09,https://reliefweb.int/node/275899,- no human death has been reported in the stat...,death
395,2008-08-09,https://reliefweb.int/node/275899,- no human death has been reported in the stat...,death
400,2008-08-09,https://reliefweb.int/node/275899,- no human death has been reported in the stat...,death
398,2008-08-09,https://reliefweb.int/node/275899,- no human death has been reported in the stat...,death
...,...,...,...,...
2905,2018-05-21,https://www.tribuneindia.com/news/punjab/shahk...,"farmers rue that despite huge damage, no one b...",agri
1036,2018-05-21,https://www.tribuneindia.com/news/punjab/shahk...,"farmers rue that despite huge damage, no one b...",lives
1033,2018-05-21,https://www.tribuneindia.com/news/punjab/shahk...,"farmers rue that despite huge damage, no one b...",lives
2758,2019-09-20,https://www.tribuneindia.com/news/punjab/then-...,but there is no shortage of material nor have ...,agri


There are 67 duplicate rows. The duplicates make sense for the reliefweb articles since these articles report on multiple states and locations in one article and they repeat the sentence "no human death has been reported in ..." if there was no death in a location. 

In [27]:
# Keeping the first appearance of a duplicated text
keyword_concat_df = keyword_concat_df.drop_duplicates( keep = "first")
keyword_concat_df = keyword_concat_df.reset_index()

In [28]:
# Reset the index
keyword_concat_df = keyword_concat_df.reset_index()

In [29]:
# check that dropped all duplicated rows
keyword_concat_df[keyword_concat_df.duplicated()]

Unnamed: 0,level_0,index,date,source,sentence,topic


In [30]:
keyword_concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4213 entries, 0 to 4212
Data columns (total 6 columns):
level_0     4213 non-null int64
index       4213 non-null int64
date        4213 non-null datetime64[ns]
source      4213 non-null object
sentence    4213 non-null object
topic       4213 non-null object
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 197.6+ KB


Now we just keep the first appearance of a duplicate row. Now the dataframe is 4213 entries.

## save to csv 

In [31]:
# save to csv file 
# create csv file from the new dataframe

# dataframe of the individual searches combined together with a topic column!
keyword_concat_df.to_csv("../data/keyword_topic_sentences.csv", index = False)

In [32]:
keyword_concat_df.head()

Unnamed: 0,level_0,index,date,source,sentence,topic
0,0,0,2019-09-14,https://www.tribuneindia.com/news/punjab/jalan...,"ush({ }); in kapurthala, crop losses have bee...",death
1,1,1,2019-09-12,https://www.tribuneindia.com/news/punjab/punja...,"initial assessment 6 persons died 1,400 inju...",death
2,2,2,2019-09-03,https://www.tribuneindia.com/news/punjab/rs-4-...,"our correspondent fazilka, september 2 family...",death
3,3,3,2019-09-02,https://www.tribuneindia.com/news/punjab/power...,"as many as 25,000 personnel from different de...",death
4,4,4,2019-08-27,https://www.tribuneindia.com/news/punjab/distr...,in his desperate call to the pcr at 6.57 pm on...,death


**Summary**: One large topic concatenated dataframe was created and it has approximately 1000 more rows than a dataframe just passing through the combined keyword search. This means that that some of the sentences are being selected in multiple keyword searches. Next, this csv file will be used to create a SQL database and visuals to represent the findings. 