# World Data Mini-Project Notebook

Exploratory work and analysis are performed in this notebook, up to any modeling that needs to be done.

In [1]:
import os
import warnings
from math import sqrt
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,recall_score,precision_score

warnings.filterwarnings("ignore")

In [2]:
from env import host, user, password

In [3]:
# Query for the whole `world` db: every `country` column plus selected
# `city` and `countrylanguage` columns, joined on the country code.
# NOTE: `c.*` already contains `Name` and `Population`, so `cy.Name` and
# `cy.Population` are duplicate column names; they appear as `Name.1` and
# `Population.1` in the frame displayed further down.
# NOTE: `city` and `countrylanguage` are both joined on the country code only,
# so each city row is repeated once per language of its country.
world_all = '''
            SELECT c.*, cy.Name, cy.District, cy.Population, cl.Language, cl.Isofficial, cl.Percentage
            FROM country as c
            JOIN city AS cy ON c.code = cy.countrycode
            JOIN countrylanguage as cl ON c.code = cl.countrycode;
            '''

In [4]:
def get_db(db, user=user, host=host, password=password):
    '''
    Build the connection URL string for a database on the MySQL server.

    Parameters
    ----------
    db : str
        Name of the database to connect to (e.g. 'world').
    user, host, password : str
        Connection details; they default to the values imported from `env`.

    Returns
    -------
    str
        URL of the form 'mysql+pymysql://<user>:<password>@<host>/<db>',
        with the user and password percent-encoded.
    '''
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # in a password are not mistaken for URL delimiters. Plain alphanumeric
    # credentials come out unchanged.
    safe_user = quote_plus(str(user))
    safe_password = quote_plus(str(password))
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [5]:
def new_world_data():
    '''
    This function reads the world data from the Codeup db into a df,
    writes it to a csv file, and returns the df.

    The query is always run against the `world` database (no cached copy is
    consulted) and the result overwrites `world_data.csv` in the working
    directory.

    Returns
    -------
    pandas.DataFrame
        The result of the query below.
    '''

    world_all = '''
                SELECT c.*, cy.Name, cy.District, cy.Population, cl.Language, cl.Isofficial, cl.Percentage
                FROM country as c
                JOIN city AS cy ON c.code = cy.countrycode
                JOIN countrylanguage as cl ON c.code = cl.countrycode;
                '''

    df = pd.read_sql(world_all, get_db('world'))

    # index=False so the csv holds only the query columns, the same way
    # get_world_data writes this file.
    df.to_csv('world_data.csv', index=False)

    return df

In [6]:
def get_world_data(cache = False):
    '''
    Return the world data as a DataFrame, preferring the local csv copy.

    If `world_data.csv` exists in the working directory it is read and
    returned. Otherwise the query below is run against the `world` database,
    the result is written to `world_data.csv` (without the index) and
    returned.

    Note: `cache` is accepted but is not read anywhere in the body.
    '''
    csv_path = 'world_data.csv'

    # Fast path: a saved copy is on disk, so the database is not touched.
    if os.path.isfile(csv_path):
        cached_frame = pd.read_csv(csv_path)
        return cached_frame

    query = '''
            SELECT c.*, cy.Name, cy.District, cy.Population, cl.Language, cl.Isofficial, cl.Percentage
            FROM country as c
            JOIN city AS cy ON c.code = cy.countrycode
            JOIN countrylanguage as cl ON c.code = cl.countrycode;
            '''

    # No saved copy: pull from the database and save it for the next call.
    connection_url = get_db('world')
    fresh_frame = pd.read_sql(query, connection_url)
    fresh_frame.to_csv(csv_path, index = False)
    return fresh_frame

In [7]:
# Load the world data: read from world_data.csv when that file exists,
# otherwise queried from the database and saved to that file.
df = get_world_data()
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,...,GovernmentForm,HeadOfState,Capital,Code2,Name.1,District,Population.1,Language,Isofficial,Percentage
0,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,...,Nonmetropolitan Territory of The Netherlands,Beatrix,129,AW,Oranjestad,–,29034,Dutch,T,5.3
1,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,...,Nonmetropolitan Territory of The Netherlands,Beatrix,129,AW,Oranjestad,–,29034,English,F,9.5
2,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,...,Nonmetropolitan Territory of The Netherlands,Beatrix,129,AW,Oranjestad,–,29034,Papiamento,F,76.7
3,ABW,Aruba,North America,Caribbean,193.0,,103000,78.4,828.0,793.0,...,Nonmetropolitan Territory of The Netherlands,Beatrix,129,AW,Oranjestad,–,29034,Spanish,F,7.4
4,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,22720000,45.9,5976.0,,...,Islamic Emirate,Mohammad Omar,1,AF,Kabul,Kabol,1780000,Balochi,F,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30665,ZWE,Zimbabwe,Africa,Eastern Africa,390757.0,1980.0,11669000,37.8,5951.0,8670.0,...,Republic,Robert G. Mugabe,4068,ZW,Mutare,Manicaland,131367,Shona,F,72.1
30666,ZWE,Zimbabwe,Africa,Eastern Africa,390757.0,1980.0,11669000,37.8,5951.0,8670.0,...,Republic,Robert G. Mugabe,4068,ZW,Gweru,Midlands,128037,English,T,2.2
30667,ZWE,Zimbabwe,Africa,Eastern Africa,390757.0,1980.0,11669000,37.8,5951.0,8670.0,...,Republic,Robert G. Mugabe,4068,ZW,Gweru,Midlands,128037,Ndebele,F,16.2
30668,ZWE,Zimbabwe,Africa,Eastern Africa,390757.0,1980.0,11669000,37.8,5951.0,8670.0,...,Republic,Robert G. Mugabe,4068,ZW,Gweru,Midlands,128037,Nyanja,F,2.2


In [10]:
# `Name.1` is the city name (the second `Name` column of the join; the first is the country name).
# These are row counts of the joined frame, not distinct cities: each city row
# is repeated once per language of its country.
df['Name.1'].value_counts()

Richmond       36
Springfield    36
San Jose       32
Toledo         27
Cambridge      27
               ..
Tabuk           1
al-Mubarraz     1
Bayamo          1
al-Kharj        1
Montevideo      1
Name: Name.1, Length: 4001, dtype: int64

## Prep/Tidy Data

The data needs a lot of prep and cleanup, starting with column renaming.