In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# train test split from sklearn
from sklearn.model_selection import train_test_split
# imputer from sklearn
from sklearn.impute import SimpleImputer

# filter out warnings
import warnings
warnings.filterwarnings('ignore')

# our own acquire script:
import acquire
from pydataset import data
from env import get_db_url

In [2]:
df_iris = data('iris')
print(df_iris.head(3))

   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
1           5.1          3.5           1.4          0.2  setosa
2           4.9          3.0           1.4          0.2  setosa
3           4.7          3.2           1.3          0.2  setosa


In [3]:
print(df_iris.shape)

(150, 5)


In [5]:
print(df_iris.columns)

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')


In [5]:
print(df_iris.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB
None


In [6]:
print(df_iris.describe())

       Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [8]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1iYJPktqfAanL0dKg1jLXBtO1bk9H0b5zTFrDpFPU9Nk/export?format=csv&gid=1023018493'
df_google_sheets = pd.read_csv(sheet_url)
df_google_sheets_sample = df_google_sheets.head(100)

In [9]:
len(df_google_sheets)

7049

In [10]:
print(df_google_sheets.columns[:5])

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')


In [11]:
print(df_google_sheets.select_dtypes(include=['object']).columns)

Index(['customer_id', 'gender', 'partner', 'dependents', 'payment_type',
       'churn'],
      dtype='object')


In [12]:
columns_with_num = df_google_sheets.select_dtypes(include =['float', 'int'])
range_for_gs = columns_with_num.max() - columns_with_num.min()
range_for_gs

is_senior_citizen       1.0
phone_service           2.0
internet_service        2.0
contract_type           2.0
monthly_charges       100.5
total_charges        8666.0
tenure                 79.3
dtype: float64

In [13]:
df_excel = pd.read_excel('spreadsheet_exercises.xlsx', sheet_name = 'Table1_CustDetails')
df_excel.head()

Unnamed: 0,customer_id,gender,is_senior_citizen,partner,dependents,phone_service,internet_service,contract_type,payment_type,monthly_charges,total_charges,churn,tenure
0,0002-ORFBO,Female,0.0,Yes,Yes,1.0,1.0,1.0,Mailed check,65.6,593.3,No,9.044207
1,0003-MKNFE,Male,0.0,No,No,2.0,1.0,0.0,Mailed check,59.9,542.4,No,9.055092
2,0004-TLHLJ,Male,0.0,No,No,1.0,2.0,0.0,Electronic check,73.9,280.85,Yes,3.800406
3,0011-IGKFF,Male,1.0,Yes,No,1.0,2.0,0.0,Electronic check,98.0,1237.85,Yes,12.631122
4,0013-EXCHZ,Female,1.0,Yes,No,1.0,2.0,0.0,Mailed check,83.9,267.4,Yes,3.187128


In [14]:
df_excel_sample = df_excel.head(100)

In [15]:
print(df_excel.shape)

(7049, 13)


In [16]:
print(df_excel.columns[:5])

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')


In [17]:
print(df_excel.select_dtypes(include=['object']).columns)

Index(['customer_id', 'gender', 'partner', 'dependents', 'payment_type',
       'churn'],
      dtype='object')


In [18]:
columns_with_num_excel = df_excel.select_dtypes(include =['float', 'int'])
range_for_excel = columns_with_num.max() - columns_with_num.min()
range_for_excel

is_senior_citizen       1.0
phone_service           2.0
internet_service        2.0
contract_type           2.0
monthly_charges       100.5
total_charges        8666.0
tenure                 79.3
dtype: float64

In [19]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357' 

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_google = pd.read_csv(csv_export_url)
df_google.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
from acquire import get_titanic_data

In [21]:
df_titanic = get_titanic_data()
df_titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [22]:
from acquire import get_iris_data

In [23]:
df_iris = get_iris_data()
df_iris.head()

Unnamed: 0.1,Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,0,1,setosa,5.1,3.5,1.4,0.2
1,1,1,setosa,4.9,3.0,1.4,0.2
2,2,1,setosa,4.7,3.2,1.3,0.2
3,3,1,setosa,4.6,3.1,1.5,0.2
4,4,1,setosa,5.0,3.6,1.4,0.2


In [24]:
from acquire import get_telco_data

In [25]:
df_telco = get_telco_data()
df_telco.head()

Unnamed: 0.1,Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,1,2,1,1,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,2,1,2,1,0004-TLHLJ,Male,0,No,No,4,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# train test split from sklearn
from sklearn.model_selection import train_test_split
# imputer from sklearn
from sklearn.impute import SimpleImputer

# filter out warnings
import warnings
warnings.filterwarnings('ignore')

# our own acquire script:
import acquire

In [27]:
from acquire import get_iris_data

In [42]:
df_iris = get_iris_data()
df_iris.head()

Unnamed: 0.1,Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,0,1,setosa,5.1,3.5,1.4,0.2
1,1,1,setosa,4.9,3.0,1.4,0.2
2,2,1,setosa,4.7,3.2,1.3,0.2
3,3,1,setosa,4.6,3.1,1.5,0.2
4,4,1,setosa,5.0,3.6,1.4,0.2


In [43]:
columns_to_drop = ['species_id', 'Unnamed: 0']

In [44]:
df_iris = df_iris.drop(columns = columns_to_drop)
df_iris

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
5,setosa,5.4,3.9,1.7,0.4
6,setosa,4.6,3.4,1.4,0.3
7,setosa,5.0,3.4,1.5,0.2
8,setosa,4.4,2.9,1.4,0.2
9,setosa,4.9,3.1,1.5,0.1


In [45]:
df_iris.rename(columns = {'species_name':'species'}, inplace = True)
df_iris

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
5,setosa,5.4,3.9,1.7,0.4
6,setosa,4.6,3.4,1.4,0.3
7,setosa,5.0,3.4,1.5,0.2
8,setosa,4.4,2.9,1.4,0.2
9,setosa,4.9,3.1,1.5,0.1


In [46]:
dummy_df = pd.get_dummies(df_iris[['species']],dummy_na=False, drop_first=[True])
dummy_df

Unnamed: 0,species_versicolor,species_virginica
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [48]:
df_iris = pd.concat([df_iris, dummy_df], axis=1)
df_iris

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica,species_versicolor.1,species_virginica.1
0,setosa,5.1,3.5,1.4,0.2,0,0,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0,0,0
5,setosa,5.4,3.9,1.7,0.4,0,0,0,0
6,setosa,4.6,3.4,1.4,0.3,0,0,0,0
7,setosa,5.0,3.4,1.5,0.2,0,0,0,0
8,setosa,4.4,2.9,1.4,0.2,0,0,0,0
9,setosa,4.9,3.1,1.5,0.1,0,0,0,0


In [49]:
def prep_iris(df_iris):
    columns_to_drop = ['species_id', 'Unnamed: 0']
    df_iris = df_iris.drop(columns = columns_to_drop)
    df_iris = df_iris.rename(columns = {'species_name':'species'}, inplace = True)
    dummy_df = pd.get_dummies(df_iris[['species']], dummy_na=False, drop_first=[True])
    df_iris = pd.concat([df_iris, dummy_df], axis=1)
    return df
    

In [50]:
df_iris = acquire.get_iris_data()

In [38]:
df_iris = prep_iris(df_iris)
df_iris

TypeError: 'NoneType' object is not subscriptable

In [None]:
def clean_titanic_data(df):
    '''
    Takes in a titanic dataframe and returns a cleaned dataframe
    Arguments: df - a pandas dataframe with the expected feature names and columns
    Return: clean_df - a dataframe with the cleaning operations performed on it
    '''
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    # Drop columns 
    columns_to_drop = ['embarked', 'class', 'passenger_id', 'deck']
    df = df.drop(columns = columns_to_drop)
    # encoded categorical variables
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
    df = pd.concat([df, dummy_df], axis=1)
    return df.drop(columns=['sex', 'embark_town'])   

In [None]:
def impute_age(train, validate, test):
    '''
    Imputes the mean age of train to all three datasets
    '''
    imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
    imputer = imputer.fit(train[['age']])
    train[['age']] = imputer.transform(train[['age']])
    validate[['age']] = imputer.transform(validate[['age']])
    test[['age']] = imputer.transform(test[['age']])
    return train, validate, test

In [None]:
def prep_titanic_data(df): 
    df = clean_titanic_data(df)
    train, test = train_test_split(df,
                               train_size = 0.8,
                               stratify = df.survived,
                               random_state=1234)
    train, validate = train_test_split(train,
                                  train_size = 0.7,
                                  stratify = train.survived,
                                  random_state=1234)
    train, validate, test = impute_age(train, validate, test)
    return train, validate, test

In [None]:
df_titanic = acquire.get_titanic_data()
train, validate, test = prep_titanic_data(df_titanic)
train.head()