# **Importing libraries**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# **Loading data**

In [2]:
# Load the news-headlines CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='india-news-headlines.csv')

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [4]:
# (rows, columns) of the loaded dataset.
df.shape

(3650970, 3)

In [5]:
# Number of headlines per category, most frequent first.
df['headline_category'].value_counts()

india                                                    297491
unknown                                                  209583
city.mumbai                                              150451
city.delhi                                               137648
business.india-business                                  121195
                                                          ...  
nepal-india-earthquake.opinion                                8
elections.lok-sabha-elections-2019.tripura.news               8
best-products.home-decor-and-garden.living-room-decor         8
profiles.india-profiles                                       8
pms-us-visit                                                  8
Name: headline_category, Length: 1041, dtype: int64

# **Grouping Flood occurrences**

In [6]:
# Start with an empty container for the headlines that mention floods.
flood_occurrences = list()

In [7]:
# Select every headline that mentions flooding.
#
# A vectorised substring match replaces the row-by-row `iterrows()` scan:
#   * it is far faster on a frame with millions of rows;
#   * missing headlines (NaN) count as "no match" instead of raising TypeError;
#   * the result is assigned, not appended, so re-running this cell cannot
#     accumulate duplicate rows.
# The match is case-insensitive, so it also catches mixed-case spellings
# (e.g. "FLoods") in addition to the 'FLOOD' / 'flood' / 'Flood' variants.
flood_mask = df['headline_text'].str.contains('flood', case=False, regex=False, na=False)

# `.copy()` detaches the selection from `df` so later column assignments on the
# derived frame do not trigger SettingWithCopyWarning.
flood_occurrences = df[flood_mask].copy()

In [8]:
# Turn the collected flood headlines into a DataFrame for further processing.
flood = pd.DataFrame(data=flood_occurrences)

In [9]:
# Inspect the headlines that matched the flood keywords.
flood

Unnamed: 0,publish_date,headline_category,headline_text
213,20010104,unknown,PIL urges to seek flood relief from UN
776,20010125,unknown,Illegal arms flood Delhi
1189,20010203,unknown,First the quake; then the flood
1217,20010203,unknown,First the quake; then the flood
4130,20010509,unknown,A masterplan to stop flooding
...,...,...,...
3647836,20220325,city.jaipur,Rajasthan rural hospitals in sick bed as 'extr...
3648619,20220327,city.hyderabad,Telangana: TRS to flood PM Narendra Modi with ...
3649770,20220329,city.indore,Narmada pipeline bursts; busy MR-10 near Star ...
3649781,20220329,city.vijayawada,HC issues notice to govt in PIL on Kadapa floods


In [10]:
# publish_date is stored as a YYYYMMDD number: dividing by 10000 and truncating
# leaves the year; dividing by 100, taking the remainder mod 100 and truncating
# leaves the month.
flood['year'] = flood['publish_date'].div(10000).astype(int)
flood['month'] = flood['publish_date'].div(100).mod(100).astype(int)

In [11]:
# Confirm that the derived year and month columns were added.
flood

Unnamed: 0,publish_date,headline_category,headline_text,year,month
213,20010104,unknown,PIL urges to seek flood relief from UN,2001,1
776,20010125,unknown,Illegal arms flood Delhi,2001,1
1189,20010203,unknown,First the quake; then the flood,2001,2
1217,20010203,unknown,First the quake; then the flood,2001,2
4130,20010509,unknown,A masterplan to stop flooding,2001,5
...,...,...,...,...,...
3647836,20220325,city.jaipur,Rajasthan rural hospitals in sick bed as 'extr...,2022,3
3648619,20220327,city.hyderabad,Telangana: TRS to flood PM Narendra Modi with ...,2022,3
3649770,20220329,city.indore,Narmada pipeline bursts; busy MR-10 near Star ...,2022,3
3649781,20220329,city.vijayawada,HC issues notice to govt in PIL on Kadapa floods,2022,3


In [129]:
# Build a table with one row per year (2001-2022) and one column per calendar
# month, holding the number of flood headlines published in that month.
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']

# Collect one record per year and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration.
yearly_records = []
for year in range(2001, 2023):
    # Count headlines per month, then align the counts on month numbers 1..12.
    # Months with no matching headline become an explicit 0 in their own
    # column. Padding zeros at the end and reading the counts by position would
    # shift every count after a missing month into the wrong column, and would
    # also depend on the name pandas gives the value_counts() result.
    monthly_counts = (
        flood.loc[flood['year'] == year, 'month']
        .value_counts()
        .reindex(range(1, 13), fill_value=0)
    )

    record = {'Year': year}
    record.update(zip(month_names, monthly_counts.tolist()))
    yearly_records.append(record)

flood_freq = pd.DataFrame(yearly_records, columns=['Year'] + month_names)


In [133]:
# Discard the existing index and replace it with a default 0..n-1 integer index.
flood_freq = flood_freq.reset_index(drop=True)

In [134]:
# Save the year-by-month flood counts to an Excel workbook in the working directory.
flood_freq.to_excel("Flood Data.xlsx")

# **Grouping Cholera Occurrences (incl. diarrhea, typhoid, dengue)**

In [135]:
# Start with an empty container for the headlines that mention the diseases.
cholera_occurrences = list()

In [153]:
# Select every headline that mentions cholera, diarrhea, typhoid or dengue.
#
# A vectorised regex match replaces the row-by-row `iterrows()` scan and the
# twelve hand-written spelling checks:
#   * it is far faster on a frame with millions of rows;
#   * missing headlines (NaN) count as "no match" instead of raising TypeError;
#   * the result is assigned, not appended, so re-running this cell cannot
#     accumulate duplicate rows.
# The match is case-insensitive, so it also catches mixed-case spellings in
# addition to the UPPER / lower / Capitalised variants.
disease_pattern = 'cholera|diarrhea|typhoid|dengue'
disease_mask = df['headline_text'].str.contains(disease_pattern, case=False, na=False)

# `.copy()` detaches the selection from `df` so later column assignments on the
# derived frame do not trigger SettingWithCopyWarning.
cholera_occurrences = df[disease_mask].copy()

In [154]:
# Turn the collected disease headlines into a DataFrame for further processing.
cholera = pd.DataFrame(data=cholera_occurrences)

In [155]:
# Inspect the headlines that matched the disease keywords.
cholera

Unnamed: 0,publish_date,headline_category,headline_text
143,20010104,unknown,Cholera outbreak source traced
175,20010104,unknown,Cholera outbreak source traced
195,20010104,unknown,Cholera outbreak source traced
2530,20010321,unknown,After gastro; cholera rears its head
6917,20010713,city.ahmedabad,Govt declares Vadodara 'cholera-threatened'
...,...,...,...
3641581,20220313,city.hyderabad,Hyderabad researchers keen to test first ever ...
3645647,20220321,city.kolkata,BMC starts dengue prevention drive
3647030,20220324,city.delhi,EDMC to involve residents in fight against dengue
3648662,20220327,city.kolkata,Kolkata: Dengue drives start in BMC; New Town


In [156]:
# publish_date is stored as a YYYYMMDD number: dividing by 10000 and truncating
# leaves the year; dividing by 100, taking the remainder mod 100 and truncating
# leaves the month.
cholera['year'] = cholera['publish_date'].div(10000).astype(int)
cholera['month'] = cholera['publish_date'].div(100).mod(100).astype(int)

In [157]:
# Confirm that the derived year and month columns were added.
cholera

Unnamed: 0,publish_date,headline_category,headline_text,year,month
143,20010104,unknown,Cholera outbreak source traced,2001,1
175,20010104,unknown,Cholera outbreak source traced,2001,1
195,20010104,unknown,Cholera outbreak source traced,2001,1
2530,20010321,unknown,After gastro; cholera rears its head,2001,3
6917,20010713,city.ahmedabad,Govt declares Vadodara 'cholera-threatened',2001,7
...,...,...,...,...,...
3641581,20220313,city.hyderabad,Hyderabad researchers keen to test first ever ...,2022,3
3645647,20220321,city.kolkata,BMC starts dengue prevention drive,2022,3
3647030,20220324,city.delhi,EDMC to involve residents in fight against dengue,2022,3
3648662,20220327,city.kolkata,Kolkata: Dengue drives start in BMC; New Town,2022,3


In [158]:
# Build a table with one row per year (2001-2022) and one column per calendar
# month, holding the number of disease headlines published in that month.
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']

# Collect one record per year and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration.
yearly_records = []
for year in range(2001, 2023):
    # Count headlines per month, then align the counts on month numbers 1..12.
    # Months with no matching headline become an explicit 0 in their own
    # column. Padding zeros at the end and reading the counts by position would
    # shift every count after a missing month into the wrong column, and would
    # also depend on the name pandas gives the value_counts() result.
    monthly_counts = (
        cholera.loc[cholera['year'] == year, 'month']
        .value_counts()
        .reindex(range(1, 13), fill_value=0)
    )

    record = {'Year': year}
    record.update(zip(month_names, monthly_counts.tolist()))
    yearly_records.append(record)

cholera_freq = pd.DataFrame(yearly_records, columns=['Year'] + month_names)


In [150]:
# Discard the existing index and replace it with a default 0..n-1 integer index.
cholera_freq = cholera_freq.reset_index(drop=True)

In [152]:
# Save the year-by-month disease counts to an Excel workbook in the working directory.
cholera_freq.to_excel("Cholera data.xlsx")