## Creating Maps and various other things: Python, Pandas, etc.
This is a space for trying out some things, learning new skills, and practicing old ones. 

### Getting Started

In [2]:
# Import the relevant libraries
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt

%matplotlib inline

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

# Enlarge the default figure size applied through seaborn's rc settings
seaborn_rc = {'figure.figsize': (12, 10)}
sns.set(rc=seaborn_rc)

In [3]:
from pandas_profiling import ProfileReport

In [4]:
# Read the training features and their labels from the local CSV files
VALUES_CSV = 'Tanz_train_values.csv'
LABELS_CSV = 'Tanz_train_labels.csv'
raw_data_x = pd.read_csv(VALUES_CSV)
raw_data_y = pd.read_csv(LABELS_CSV)

# Report the (rows, columns) shape of each frame
for frame_label, frame in (("Raw_data_x:", raw_data_x), ("Raw_data_y:", raw_data_y)):
    print(frame_label, frame.shape)

Raw_data_x: (59400, 40)
Raw_data_y: (59400, 2)


In [5]:
# Join the feature and label frames on their shared "id" column (inner join)
raw_all = raw_data_x.merge(raw_data_y, how="inner", on="id")
raw_all.shape

(59400, 41)

In [6]:
# Summarise the merged frame: column names, non-null counts and dtypes
raw_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

In [7]:
# Preview the first rows of the merged frame
raw_all.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [25]:
# Keep only the target label plus the location-related columns
location_columns = ["status_group", "longitude", "latitude", "source_class", "population"]
df_location = raw_all[location_columns]
df_location.head()

Unnamed: 0,status_group,longitude,latitude,source_class,population
0,functional,34.938093,-9.856322,groundwater,109
1,functional,34.698766,-2.147466,surface,280
2,functional,37.460664,-3.821329,surface,250
3,non functional,38.486161,-11.155298,groundwater,58
4,functional,31.130847,-1.825359,surface,0


In [26]:
# Check dtypes and non-null counts of the location subset
df_location.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 5 columns):
status_group    59400 non-null object
longitude       59400 non-null float64
latitude        59400 non-null float64
source_class    59400 non-null object
population      59400 non-null int64
dtypes: float64(2), int64(1), object(2)
memory usage: 2.7+ MB


## Method for reading an HTML table from Wikipedia into pandas

In [8]:
# Saw this method in a video: read_html returns the page's tables as a list of DataFrames
WIKI_DEMOGRAPHICS_URL = "https://en.wikipedia.org/wiki/Demographics_of_the_United_States"
dfs = pd.read_html(WIKI_DEMOGRAPHICS_URL)

In [9]:
# Number of tables parsed from the page
len(dfs)

67

In [10]:
# First table parsed from the page
dfs[0]

Unnamed: 0,Demographics of the United States,Demographics of the United States.1
0,Population,"331,449,281 (2020 U.S. Census)[1]"
1,Density,86.16/sq mi (33.27/km2)
2,Growth rate,0.72% (2020)[2]
3,Birth rate,"11.6 births/1,000 population (2020)[2]"
4,Death rate,"8.9 deaths/1,000 population (2020)[2]"
5,Life expectancy,77.8 years (2020)[3]
6,• male,75.1 years[3]
7,• female,80.5 years[3]
8,Fertility rate,1.638 children born/woman (2020)[4]
9,Net migration rate,"3 migrant(s)/1,000 population (2020)[2]"


In [19]:
# Table at position 25 of the parsed list.
# NOTE(review): this is a positional index into scraped tables, so it presumably
# shifts whenever the page layout changes — confirm the table before relying on it.
dfs[25]

Unnamed: 0,Race,Population (2017 est.),Share of total population
0,Total,321004407,100%
1,One race,310923363,96.9%
2,White,234370202,73.0%
3,Black or African American,40610815,12.7%
4,American Indian and Alaska Native,2632102,0.8%
5,Asian,17186320,5.4%
6,Native Hawaiian and Other Pacific Islander,570116,0.2%
7,Other races,15553808,4.8%
8,Two or more races,10081044,3.1%
9,White and Black or African American,2657560,0.8%


In [24]:
# pd.crosstab(raw_all['status_group'],raw_all['basin']).plot.bar()

In [23]:
# ax = sns.barplot(x="status_group", y="num_private", data=raw_all)

In [9]:
# profile = ProfileReport(raw_all, title="Pandas Waterwell Profiling Report")

In [13]:
# profile.to_file("Waterwell_report.html")

In [12]:
# profile

### APPENDIX

In [None]:
# EDA - Look at distribution of categorical variables (one count plot per variable, 3 across)
# NOTE(review): df_not_num is not defined in the visible cells of this notebook —
# presumably a frame holding only the non-numeric columns; define it before running.

n_cols = 3
categorical_columns = list(df_not_num.columns)

# Ceiling division. round() allocates one row too few whenever the column count
# leaves a remainder of 1 (e.g. 4 columns -> round(1.33) == 1 row -> only 3 axes),
# and yields 0 rows for a single column. max() keeps the grid valid when empty.
n_rows = max(1, (len(categorical_columns) + n_cols - 1) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 30))

for i, ax in enumerate(fig.axes):
    if i < len(categorical_columns):
        sns.countplot(x=categorical_columns[i], alpha=0.7, data=df_not_num, ax=ax)
        # Rotate after plotting so the rotation applies to the category labels
        # that countplot draws rather than to the empty axes' placeholder ticks.
        ax.tick_params(axis="x", labelrotation=45)
    else:
        # Hide grid cells that have no variable to show
        ax.set_visible(False)

fig.tight_layout()

In [None]:
# EDA - box-and-whisker plot of SalePrice for each BsmtExposure category
# NOTE(review): df_categ is not defined in the visible cells — confirm it exists before running.
box_fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_categ, x='BsmtExposure', y='SalePrice', ax=ax)
# Restyle the box artists: semi-transparent fill with a black outline
plt.setp(ax.artists, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45)

In [None]:
# Plotly

In [None]:
# Example from Project 2 from Mel
# NOTE(review): `px`, `go` and `df` are not defined in the visible cells of this
# notebook — presumably plotly.express, plotly.graph_objects and a frame with
# 'lat', 'long' and 'price' columns; confirm before running.

# Latitudes of the horizontal guide lines drawn across the map
guide_latitudes = [47.155, 47.26, 47.363, 47.467, 47.57, 47.674, 47.778]

fig = px.scatter_mapbox(data_frame=df, lat=df['lat'], lon=df['long'], color="price",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=15, zoom=8)
fig.update_layout(mapbox_style="open-street-map")

# One line trace per latitude, each spanning longitudes -180 to 180.
# Replaces seven copy-pasted add_trace calls that differed only in latitude.
for guide_lat in guide_latitudes:
    fig.add_trace(go.Scattermapbox(
        mode="lines",
        lon=[-180, 180],
        lat=[guide_lat, guide_lat],
        marker={'size': 10, 'color': 'violet'}))

fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Correlation heatmap with the upper triangle hidden
# NOTE(review): df4 is not defined in the visible cells — confirm it exists before running.
corr = df4.corr()
display(corr)

# Mask is 1 on and above the diagonal so seaborn leaves those cells blank
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(30, 30))
sns.heatmap(corr, mask=mask, annot=True, cmap='rainbow', center=0)