In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import plot, iplot,init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [2]:
# Load the two source CSV files (paths are relative to the current working directory).
# inc: table read from 'incidence_per_1000_pop_at_risk.csv'
# rpd: table read from 'reported_numbers.csv'
inc=pd.read_csv('incidence_per_1000_pop_at_risk.csv')
rpd=pd.read_csv('reported_numbers.csv')

In [3]:
print(inc)

                                 Country  Year  No. of cases  \
0                            Afghanistan  2018         29.01   
1                                Algeria  2018          0.00   
2                                 Angola  2018        228.91   
3                              Argentina  2018          0.00   
4                                Armenia  2018          0.00   
...                                  ...   ...           ...   
2028  Venezuela (Bolivarian Republic of)  2000          2.94   
2029                            Viet Nam  2000          3.42   
2030                               Yemen  2000         47.47   
2031                              Zambia  2000        385.65   
2032                            Zimbabwe  2000        123.00   

                 WHO Region  
0     Eastern Mediterranean  
1                    Africa  
2                    Africa  
3                  Americas  
4                    Europe  
...                     ...  
2028               Am

In [4]:
print(rpd)

                                 Country  Year  No. of cases  No. of deaths  \
0                            Afghanistan  2017      161778.0           10.0   
1                                Algeria  2017           0.0            0.0   
2                                 Angola  2017     3874892.0        13967.0   
3                              Argentina  2017           0.0            1.0   
4                                Armenia  2017           0.0            NaN   
...                                  ...   ...           ...            ...   
1939  Venezuela (Bolivarian Republic of)  2000       29736.0           24.0   
1940                            Viet Nam  2000       74316.0          142.0   
1941                               Yemen  2000     1394495.0            NaN   
1942                              Zambia  2000           NaN            NaN   
1943                            Zimbabwe  2000           NaN            NaN   

                 WHO Region  
0     Eastern Mediter

In [5]:
inc.shape

(2033, 4)

In [6]:
rpd.shape

(1944, 5)

In [7]:
inc.head()

Unnamed: 0,Country,Year,No. of cases,WHO Region
0,Afghanistan,2018,29.01,Eastern Mediterranean
1,Algeria,2018,0.0,Africa
2,Angola,2018,228.91,Africa
3,Argentina,2018,0.0,Americas
4,Armenia,2018,0.0,Europe


In [8]:
inc.tail()

Unnamed: 0,Country,Year,No. of cases,WHO Region
2028,Venezuela (Bolivarian Republic of),2000,2.94,Americas
2029,Viet Nam,2000,3.42,Western Pacific
2030,Yemen,2000,47.47,Eastern Mediterranean
2031,Zambia,2000,385.65,Africa
2032,Zimbabwe,2000,123.0,Africa


In [9]:
rpd.head()

Unnamed: 0,Country,Year,No. of cases,No. of deaths,WHO Region
0,Afghanistan,2017,161778.0,10.0,Eastern Mediterranean
1,Algeria,2017,0.0,0.0,Africa
2,Angola,2017,3874892.0,13967.0,Africa
3,Argentina,2017,0.0,1.0,Americas
4,Armenia,2017,0.0,,Europe


In [10]:
rpd.tail()

Unnamed: 0,Country,Year,No. of cases,No. of deaths,WHO Region
1939,Venezuela (Bolivarian Republic of),2000,29736.0,24.0,Americas
1940,Viet Nam,2000,74316.0,142.0,Western Pacific
1941,Yemen,2000,1394495.0,,Eastern Mediterranean
1942,Zambia,2000,,,Africa
1943,Zimbabwe,2000,,,Africa


In [11]:
rpd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1944 entries, 0 to 1943
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Country        1944 non-null   object 
 1   Year           1944 non-null   int64  
 2   No. of cases   1710 non-null   float64
 3   No. of deaths  1675 non-null   float64
 4   WHO Region     1944 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 76.1+ KB


In [12]:
inc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2033 entries, 0 to 2032
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       2033 non-null   object 
 1   Year          2033 non-null   int64  
 2   No. of cases  2033 non-null   float64
 3   WHO Region    2033 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 63.7+ KB


DATA CLEANING

In [13]:
#checking null values

In [14]:
rpd.isnull().sum()

Country            0
Year               0
No. of cases     234
No. of deaths    269
WHO Region         0
dtype: int64

In [15]:
#dropping null values

In [16]:
# dropna() with default arguments removes every row that has a missing value in ANY column.
# NOTE(review): rows with a known case count but a missing death count are removed as well,
# so anything computed from rpd afterwards excludes those cases — confirm this is intended.
rpd=rpd.dropna()

In [17]:
#confirming null values

In [18]:
rpd.isnull().sum()

Country          0
Year             0
No. of cases     0
No. of deaths    0
WHO Region       0
dtype: int64

In [19]:
inc.isnull().sum()

Country         0
Year            0
No. of cases    0
WHO Region      0
dtype: int64

DATA MANIPULATION

In [20]:
inc.columns

Index(['Country', 'Year', 'No. of cases', 'WHO Region'], dtype='object')

In [21]:
rpd.columns

Index(['Country', 'Year', 'No. of cases', 'No. of deaths', 'WHO Region'], dtype='object')

In [22]:
# Normalise the measurement/region column names to snake_case so they can be
# used as attribute-style accessors and plotly column references later on.
# One mapping is shared by both frames; rename() ignores keys that are not
# present, so 'No. of deaths' is simply skipped for `inc`.
# rename() returns a new frame (no inplace mutation), which keeps the cell
# safe to re-run and avoids modifying an object another name may still point to.
snake_case_names = {
    'No. of cases': 'no_of_cases',
    'No. of deaths': 'no_of_deaths',
    'WHO Region': 'who_region',
}
inc = inc.rename(columns=snake_case_names)
rpd = rpd.rename(columns=snake_case_names)

In [23]:
rpd.columns

Index(['Country', 'Year', 'no_of_cases', 'no_of_deaths', 'who_region'], dtype='object')

In [24]:
inc.columns

Index(['Country', 'Year', 'no_of_cases', 'who_region'], dtype='object')

In [25]:
#checking datatypes

In [26]:
rpd.dtypes

Country          object
Year              int64
no_of_cases     float64
no_of_deaths    float64
who_region       object
dtype: object

In [27]:
inc.dtypes

Country         object
Year             int64
no_of_cases    float64
who_region      object
dtype: object

In [28]:
#changing the data types

In [29]:
# Cast both count columns to the platform's default integer type in a single call.
# The missing values were removed earlier, so the float -> int conversion cannot hit NaN.
rpd = rpd.astype({'no_of_cases': int, 'no_of_deaths': int})

In [30]:
# Convert the incidence values to integers.
# Round to the nearest whole number first: a bare astype(int) truncates toward
# zero, which would turn e.g. 2.94 into 2 and bias every value downward.
# NOTE(review): these values are fractional rates; consider keeping them as
# floats if the decimal precision matters for later analysis.
inc['no_of_cases'] = inc['no_of_cases'].round().astype(int)

In [31]:
rpd.dtypes

Country         object
Year             int64
no_of_cases      int32
no_of_deaths     int32
who_region      object
dtype: object

In [32]:
inc.dtypes

Country        object
Year            int64
no_of_cases     int32
who_region     object
dtype: object

DATA ANALYSIS AND VISUALISATION

After the data cleaning and manipulation stages the data set is ready for analysis.
The data set covers many countries over an 18-year period, so the best approach is to use the groupby() function to aggregate the data and then create visualisations we can gain insight from.


In [33]:
# Total reported cases and deaths per WHO region.
# Use the native GroupBy.sum() reduction instead of .apply(sum): handing Python's
# builtin sum to apply() only works because pandas silently swaps in its own
# implementation, a substitution that recent pandas versions warn about.
who_region = (
    rpd.groupby('who_region')[['no_of_cases', 'no_of_deaths']]
    .sum()
    .reset_index()  # turn the group key back into an ordinary column for plotting
)
who_region

Unnamed: 0,who_region,no_of_cases,no_of_deaths
0,Africa,545111852,1480850
1,Americas,13433321,11039
2,Eastern Mediterranean,15841260,26764
3,Europe,112675,25
4,South-East Asia,38305249,49802
5,Western Pacific,6709491,18330


In [34]:
# Total reported cases and deaths per calendar year.
# Use the native GroupBy.sum() reduction instead of .apply(sum): handing Python's
# builtin sum to apply() only works because pandas silently swaps in its own
# implementation, a substitution that recent pandas versions warn about.
Year = (
    rpd.groupby('Year')[['no_of_cases', 'no_of_deaths']]
    .sum()
    .reset_index()  # turn the group key back into an ordinary column for plotting
)
Year

Unnamed: 0,Year,no_of_cases,no_of_deaths
0,2000,5279182,21419
1,2001,5534764,26162
2,2002,5335247,70683
3,2003,8243454,91247
4,2004,9389638,87926
5,2005,11170319,76842
6,2006,11898896,78995
7,2007,13365529,76904
8,2008,13395349,87024
9,2009,17454477,115694


In [35]:
# Total reported cases and deaths per country.
# Use the native GroupBy.sum() reduction instead of .apply(sum): handing Python's
# builtin sum to apply() only works because pandas silently swaps in its own
# implementation, a substitution that recent pandas versions warn about.
countries = (
    rpd.groupby('Country')[['no_of_cases', 'no_of_deaths']]
    .sum()
    .reset_index()  # turn the group key back into an ordinary column for plotting
)
countries

Unnamed: 0,Country,no_of_cases,no_of_deaths
0,Afghanistan,1045271,363
1,Algeria,1044,4
2,Angola,26006152,125364
3,Argentina,2098,2
4,Armenia,355,0
...,...,...,...
100,Venezuela (Bolivarian Republic of),1039480,278
101,Viet Nam,445213,564
102,Yemen,895910,544
103,Zambia,18619166,8898


In [36]:
#plotting charts

In [37]:
# Horizontal bar chart of the ten countries with the most reported cases.
# The year range in the title is read from rpd (the frame `countries` was
# aggregated from) instead of being hard-coded, so it cannot drift away from
# the period the data actually covers.
first_year, last_year = rpd['Year'].min(), rpd['Year'].max()
top_cases = (
    countries.sort_values('no_of_cases', ascending=False)
    .head(10)
    .iloc[::-1]  # reverse so the largest value is listed last
)
fig = px.bar(
    top_cases,
    x='no_of_cases',
    y='Country',
    text='no_of_cases',
    title=f'Top 10 countries with highest number of malaria cases from {first_year} to {last_year}',
)
fig.show()

In [38]:
# Horizontal bar chart of the ten countries with the most reported deaths.
# The year range in the title is read from rpd (the frame `countries` was
# aggregated from) instead of being hard-coded, so it cannot drift away from
# the period the data actually covers.
first_year, last_year = rpd['Year'].min(), rpd['Year'].max()
top_deaths = (
    countries.sort_values('no_of_deaths', ascending=False)
    .head(10)
    .iloc[::-1]  # reverse so the largest value is listed last
)
fig = px.bar(
    top_deaths,
    x='no_of_deaths',
    y='Country',
    text='no_of_deaths',
    title=f'Top 10 countries with highest number of malaria deaths from {first_year} to {last_year}',
)
fig.show()

In [39]:
# pie chart

In [40]:
# Share of reported malaria cases per WHO region.
# The keys of color_discrete_map must match the values in the 'who_region'
# column exactly; a key that does not match is ignored and that slice falls
# back to a default colour.
region_colors = {
    'Africa': 'lightcyan',
    'South-East Asia': 'cyan',
    'Eastern Mediterranean': 'royalblue',
    'Americas': 'darkblue',
    'Western Pacific': 'blue',
    'Europe': 'red',
}
fig = px.pie(
    who_region,
    values='no_of_cases',
    names='who_region',
    color='who_region',
    color_discrete_map=region_colors,
    title='MALARIA CASES IN WHO REGIONS',
)
fig.show()

In [41]:
# line graphs of the yearly trend

In [42]:
# Yearly trend of reported malaria cases.
# The year range in the title comes from the plotted frame itself rather than
# a hard-coded literal, so the title always matches the x-axis.
first_year, last_year = Year['Year'].min(), Year['Year'].max()
fig = px.line(
    Year,
    x='Year',
    y='no_of_cases',
    title=f'MALARIA CASES FROM {first_year} TO {last_year}',
)
fig.show()

In [43]:
# Yearly trend of reported malaria deaths.
# The year range in the title comes from the plotted frame itself rather than
# a hard-coded literal, so the title always matches the x-axis.
first_year, last_year = Year['Year'].min(), Year['Year'].max()
fig = px.line(
    Year,
    x='Year',
    y='no_of_deaths',
    title=f'MALARIA DEATHS FROM {first_year} TO {last_year}',
)
fig.show()

In [44]:
#geographical overview of data

In [46]:
# World map of total reported malaria cases per country.
# Colour by the case count: colouring by 'Country' would assign every country
# its own unrelated categorical colour and the map would say nothing about
# how many cases each one reported.
fig = px.choropleth(
    countries,
    locationmode='country names',  # match rows to map regions by country name
    locations='Country',
    hover_data=['no_of_cases', 'no_of_deaths', 'Country'],
    hover_name='Country',
    color='no_of_cases',
    title='MALARIA CASES ACROSS THE WORLD',
)
fig.show()