In [4]:
from env import host, username, password
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import acquire as ac

In [33]:
# Show the pydataset documentation for the iris dataset.
# NOTE(review): with show_doc=True the call appears to only print the docs, so
# iris_docs is presumably None rather than a DataFrame — confirm before reusing it.
iris_docs = data('iris', show_doc = True)

iris

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Edgar Anderson's Iris Data

### Description

This famous (Fisher's or Anderson's) iris data set gives the measurements in
centimeters of the variables sepal length and width and petal length and
width, respectively, for 50 flowers from each of 3 species of iris. The
species are _Iris setosa_, _versicolor_, and _virginica_.

### Usage

    iris
    iris3

### Format

`iris` is a data frame with 150 cases (rows) and 5 variables (columns) named
`Sepal.Length`, `Sepal.Width`, `Petal.Length`, `Petal.Width`, and `Species`.

`iris3` gives the same data arranged as a 3-dimensional array of size 50 by 4
by 3, as represented by S-PLUS. The first dimension gives the case number
within the species subsample, the second the measurements with names `Sepal
L.`, `Sepal W.`, `Petal L.`, and `Petal W.`, and the third the species.

### Source

Fisher, R. A. (1936) The use of multiple measurements in taxonomi

In [31]:
# Load the iris data through the acquire module and preview the first three rows.
iris_df = ac.get_iris_data()
preview = iris_df.head(3)
print(f'The first 3 rows are...\n {preview}')

The first 3 rows are...
    species_id  measurement_id  sepal_length  sepal_width  petal_length  \
0           1               1           5.1          3.5           1.4   
1           1               2           4.9          3.0           1.4   
2           1               3           4.7          3.2           1.3   

   petal_width species_name  
0          0.2       setosa  
1          0.2       setosa  
2          0.2       setosa  


In [11]:
# Report the dimensions of the iris frame.
shape = iris_df.shape
n_rows, n_cols = shape
print(f'There are {n_rows} rows and {n_cols} columns')

there are 150 rows and 5 columns


In [22]:
# List the column labels of the iris frame.
keys = iris_df.columns
print(f'The columns are: {keys.tolist()}')

The columns are: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']


In [30]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly instead of capturing and printing the (None) return value.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB
None


In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
iris_df.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


I would not rescale the features, since all four measurements are recorded in the same unit (centimeters).

In [96]:
# Read the workbook once; parsing the same file a second time just to take
# the first 100 rows doubles the I/O for no benefit.
df_excel = pd.read_excel('mytable_customer_details.xlsx')
# .copy() keeps the sample an independent frame, as it was when read separately.
df_excel_sample = df_excel.head(100).copy()

In [48]:
# Report how many rows the full spreadsheet frame holds.
rows = df_excel.shape
row_total = rows[0]
print(f'There are {row_total} rows in the original dataframe')

There are 7049 rows in the original dataframe


In [58]:
keys = list(df_excel.keys())
# Slice + join instead of five hard-coded index lookups, so a frame with
# fewer than five columns no longer raises IndexError.
first_five = ' , '.join(map(str, keys[:5]))
print(f'The first five columns are: {first_five}')

The first five columnns are: customer_id , gender , is_senior_citizen , partner , dependents


In [62]:
# Per-column dtypes of df_excel, kept so they can be filtered in a later cell.
excel_dtypes = df_excel.dtypes

In [87]:
# Keep only the columns whose dtype is `object`.
is_object_column = excel_dtypes.values == 'object'
excel_dtypes[is_object_column]

customer_id      object
gender           object
partner          object
dependents       object
payment_type     object
churn            object
avg_charges      object
Phone desc       object
contract type    object
internet desc    object
dtype: object

In [85]:
# Summary statistics for the numeric columns of the spreadsheet data.
df_excel.describe()

Unnamed: 0,is_senior_citizen,phone_service,internet_service,contract_type,monthly_charges,total_charges,tenure,partner_dependents
count,7049.0,7049.0,7049.0,7049.0,7049.0,7038.0,7049.0,7049.0
mean,0.162009,1.324585,1.222585,0.690878,64.747014,2283.043883,32.379866,1.083416
std,0.368485,0.642709,0.779068,0.833757,30.09946,2266.521984,24.595524,1.226883
min,0.0,0.0,0.0,0.0,18.25,18.8,0.0,0.0
25%,0.0,1.0,1.0,0.0,35.45,401.5875,8.733456,0.0
50%,0.0,1.0,1.0,0.0,70.35,1397.1,28.683425,1.0
75%,0.0,2.0,2.0,1.0,89.85,3793.775,55.229399,2.0
max,1.0,2.0,2.0,2.0,118.75,8684.8,79.341772,3.0


In [94]:
# Spread of customer tenure: largest value minus smallest value.
tenure_range = df_excel['tenure'].max() - df_excel['tenure'].min()
print(f' The range of tenure is {tenure_range:.2f}')

 The range of tenure is 79.34


In [95]:
# Spread of monthly charges: largest value minus smallest value.
monthly_charges_range = df_excel['monthly_charges'].max() - df_excel['monthly_charges'].min()
print(f'The range of monthly charges is {monthly_charges_range:.2f}')

The range of monthly cahrges is 100.50


In [97]:
# Spread of total charges: largest value minus smallest value.
total_charges_range = df_excel['total_charges'].max() - df_excel['total_charges'].min()
print(f'The range of total charges is {total_charges_range:.2f}')

The range of total charges is 8666.00


In [10]:
# Turn the Google Sheets sharing link into a direct CSV export link by
# swapping the trailing "edit" segment for the "export" segment.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit?usp=sharing'
share_suffix = '/edit?usp=sharing'
export_suffix = '/export?format=csv&gid=341089357'
csv_url = sheet_url.replace(share_suffix, export_suffix)

In [12]:
# Read the sheet's CSV export into a DataFrame and display it.
train_df = pd.read_csv(csv_url)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [13]:
# Preview the first three rows.
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [14]:
# Report the dimensions of the training frame.
shape = train_df.shape
n_rows, n_cols = shape
print(f'the dataset has {n_rows} rows and {n_cols} columns')

the dataset has 891 rows and 12 columns


In [15]:
# List the column labels of the training frame.
keys = train_df.columns
column_names = keys.tolist()
print(f'The column names are {column_names}')

The column names are ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [18]:
# Print column dtypes and non-null counts (shows which columns have missing values).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# Summary statistics for the numeric columns.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [28]:
# Distinct values found in the Sex column.
unique_sex = train_df['Sex'].unique().tolist()
unique_sex

['male', 'female']

In [30]:
# Distinct values found in the Embarked column (missing entries show up as nan).
embarked = train_df['Embarked'].unique().tolist()
embarked

['S', 'C', 'Q', nan]

## Exercises

The end product of this exercise should be the specified functions in a Python script named `prepare.py`.
Do these in your `classification_exercises.ipynb` first, then transfer them to the `prepare.py` file. 

This work should all be saved in your local `classification-exercises` repo. Then add, commit, and push your changes.

Using the Iris Data:  

1. Use the function defined in `acquire.py` to load the iris data.  

1. Drop the `species_id` and `measurement_id` columns.  

1. Rename the `species_name` column to just `species`.  

1. Create dummy variables of the species name. 

1. Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.  

In [32]:
# Exercise step 1: load the iris data using the function defined in acquire.py.
iris_df = ac.get_iris_data()

In [6]:
# Exercise step 2: remove the two identifier columns.
iris_df = iris_df.drop(['species_id', 'measurement_id'], axis='columns')

In [16]:
# Exercise step 3: shorten the species_name column label to species.
iris_df = iris_df.rename({'species_name': 'species'}, axis='columns')
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [20]:
# Exercise step 4: replace the species column with dummy columns
# (drop_first=True omits the first category).
iris_df = pd.get_dummies(iris_df, columns=['species'], drop_first=True)

In [21]:
# Display the frame after dummy encoding.
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
5,5.4,3.9,1.7,0.4,0,0
6,4.6,3.4,1.4,0.3,0,0
7,5.0,3.4,1.5,0.2,0,0
8,4.4,2.9,1.4,0.2,0,0
9,4.9,3.1,1.5,0.1,0,0


In [34]:
def prep_iris(iris_df):
    '''
    Prepare the raw iris data.

    Drops the species_id and measurement_id columns, renames species_name to
    species, and replaces species with dummy columns (first category omitted).
    Returns the transformed DataFrame; the frame passed in is not modified.
    '''
    trimmed = (
        iris_df
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    return pd.get_dummies(trimmed, columns=['species'], drop_first=True)
    

In [35]:
# Prepare a freshly acquired frame: prep_iris expects the raw columns
# (species_id, measurement_id, species_name), which the step-by-step cells
# earlier in this section have already removed/renamed in iris_df, so passing
# iris_df here raises KeyError on a top-to-bottom run.
iris_df = prep_iris(ac.get_iris_data())

In [36]:
# Display the frame returned by prep_iris.
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
5,5.4,3.9,1.7,0.4,0,0
6,4.6,3.4,1.4,0.3,0,0
7,5.0,3.4,1.5,0.2,0,0
8,4.4,2.9,1.4,0.2,0,0
9,4.9,3.1,1.5,0.1,0,0
