In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model
from sqlalchemy import create_engine

In [None]:
def print_full(x):
    """Print ``x`` with pandas' row limit raised to ``len(x)``.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print; anything that supports ``len()`` and ``print()`` works.

    Returns
    -------
    None
        The text is written to stdout.
    """
    # option_context puts back whatever 'display.max_rows' was before the call,
    # even if print() raises, instead of forcing it to the pandas default.
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [None]:
def print_outliers(df):
    """Print an upper-tail and cardinality report for the columns of ``df``.

    For each int64/float column, prints the column name and position, then the
    number of values lying strictly above the 95th, 90th, 85th, 80th and 75th
    percentiles. A numeric column with fewer than 20 unique values is flagged
    as possibly categorical and its value counts are printed.

    For each object column, prints the number of unique values and, when there
    are fewer than 20 of them, the value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The report is written to stdout.
    """
    print('Continuious Variables', '\n')

    for column in df.select_dtypes(include=['int64', 'float64', 'float']):
        values = df[column]
        print(column, df.columns.get_loc(column))

        for threshold in [95, 90, 85, 80, 75]:
            # Series.quantile skips NaN. np.percentile returns NaN as soon as a
            # single value is missing, which made every count below come out 0.
            upper_value = values.quantile(threshold / 100)
            n_above = int((values > upper_value).sum())
            # The count is of values above the cut-off, so say "above", not "lower".
            print('Above the {}th percentile there are {} outliers'.format(threshold, n_above))

        n_unique = values.nunique()
        if n_unique < 20:
            print('\n')
            print('Possible Categorical Variable')
            print('There are {} unique values'.format(n_unique))
            print(values.value_counts())
        print('\n')

    print('Categorical Variables', '\n')

    for column in df.select_dtypes(include='object'):
        n_unique = df[column].nunique()
        print(column)
        print('There are {} unique values'.format(n_unique))
        if n_unique < 20:
            print(df[column].value_counts())
            print('\n')

### 1. Load the dataset from the Thinkful PostgreSQL database.

Investigate the data, and do any necessary data cleaning.
Explore the data and find some variables that you think would be useful in predicting house prices.
Build your initial model using these features and estimate the parameters using OLS.
Spend up to 4 hours on this assignment. You will submit the notebook after the assessment questions.

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable so credentials need not live in the notebook; the defaults are the
# values this notebook originally hard-coded.
# NOTE(review): the password is still present as a fallback. Remove the default
# once the environment variables are set wherever this notebook runs.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

query1 = '''
SELECT
  *
FROM
  houseprices
'''

# Release the engine's connections even when the query fails.
try:
    df = pd.read_sql_query(query1, con=engine)
finally:
    engine.dispose()

### 2. Investigate the data, and do any necessary data cleaning.

In [None]:
# Column dtypes and non-null counts, followed by the first ten rows.
df.info()
df.head(10)

In [None]:
# Percentile / cardinality report for every column of the loaded frame.
print_outliers(df)

In [None]:
# drop the id variable
# Rebinding with errors='ignore' keeps this cell re-runnable: the in-place drop
# raised KeyError on a second run because 'id' was already gone.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Sort my features into categorical and continuous

In [None]:
def get_cats_conts_unsorted(df, cat_threshold=10, unsorted_threshold=50,
                            unsorted_row_divisor=10):
    """Roughly bucket the columns of ``df`` by their number of unique values.

    The cut-offs are rough values for a first pass; columns that cannot be
    placed confidently land in ``unsorted`` and are meant to be assigned by
    hand afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be sorted.
    cat_threshold : int, default 10
        Columns with fewer unique values than this are treated as categorical.
    unsorted_threshold : int, default 50
        Remaining columns with fewer unique values than this are left unsorted.
    unsorted_row_divisor : int or float, default 10
        Remaining columns are also left unsorted when their unique-value count
        is below ``len(df) / unsorted_row_divisor``.

    Returns
    -------
    tuple of (list, list, list)
        ``(cats, conts, unsorted)`` column names, each in ``df.columns`` order.
        The size of each bucket is also printed.
    """
    cats = []
    conts = []
    unsorted = []

    # Loop-invariant, so compute the row-based cut-off once.
    row_cutoff = len(df) / unsorted_row_divisor

    for column in df.columns:
        n_unique = df[column].nunique()

        if n_unique < cat_threshold:
            cats.append(column)
        elif n_unique < unsorted_threshold or n_unique < row_cutoff:
            # There is room here to integrate a process for manually assigning
            # categorical or continuous to the unsorted columns.
            unsorted.append(column)
        else:
            conts.append(column)

    print('{} cats'.format(len(cats)),
          '\n', '{} conts'.format(len(conts)),
          '\n', '{} unsorted'.format(len(unsorted)))

    return cats, conts, unsorted

In [None]:
# First-pass bucketing; `unsorted` holds the columns the thresholds could not place.
cats, conts, unsorted = get_cats_conts_unsorted(df)
    

In [None]:
# Unique-value counts of the still-unsorted columns, highest first.
df.loc[:, unsorted].nunique().sort_values(ascending=False)

In [None]:
# Manual assignment of columns to the categorical and continuous buckets.
new_cats = ['mssubclass', 'mosold', 'overallqual', 'totrmsabvgrd', 'exterior2nd', 'exterior1st', 'neighborhood']

new_conts = ['lotfrontage', 'yearbuilt', 'garageyrblt', 'yearremodadd', 'miscval',
             'screenporch', 'lowqualfinsf', 'threessnporch', 'bsmtfinsf2', 'enclosedporch']

# Only add names that are not already present, so re-running this cell does not
# duplicate entries in either bucket.
cats.extend([name for name in new_cats if name not in cats])
conts.extend([name for name in new_conts if name not in conts])

In [None]:
# check to make sure our two buckets contain the right number of features
# NOTE(review): presumably this should equal df.shape[1] -- confirm, and
# consider turning the check into an assert.
len(cats) + len(conts)

In [None]:
# Sanity check: prints 'yes' when 'saleprice' is in the continuous bucket,
# and nothing otherwise.
if 'saleprice' in conts:
    print('yes')

In [None]:
# Same percentile / cardinality report, restricted to the continuous columns.
print_outliers(df.loc[:, conts])

### 3. Explore the data and find some variables that you think would be useful in predicting house prices.

In [1]:
def multivariate_visualization(df, target, cats, conts):
    """Plot how ``target`` relates to the continuous and categorical features.

    When ``target`` is one of ``conts`` this draws:

    * a heatmap of the pairwise correlations between all ``conts`` columns;
    * one bar plot per ``cats`` feature, with the feature on the x axis and
      ``target`` on the y axis (seaborn's default aggregate per level).

    When ``target`` is not in ``conts`` nothing is drawn; the
    categorical-target case has not been written yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and every column named in ``cats`` / ``conts``.
    target : str
        Name of the column to explain.
    cats : list of str
        Categorical feature names.
    conts : list of str
        Continuous feature names.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    if target not in conts:
        # TODO: handle a categorical target (second half of the function).
        return

    # cont / cont
    # The original called set_index(target) here and discarded the result; the
    # call is dropped so the target stays in the correlation matrix as before.
    plt.figure(figsize=(len(conts)*2, len(conts)*2))
    corr_map = df.loc[:, conts].corr()
    sns.heatmap(corr_map, square=True, annot=True, linewidths=.5)

    # cont / cat
    # Skip the second figure when there are no categorical features: a
    # zero-height figure is rejected by matplotlib.
    if len(cats) > 0:
        plt.figure(figsize=(15, len(cats)*4))

        for i, feature in enumerate(cats):
            plt.subplot(len(cats), 1, i+1)
            # x / y must be passed by keyword: seaborn 0.12+ no longer accepts
            # them positionally.
            sns.barplot(x=df[feature], y=df[target])
            plt.title(feature)
            plt.xlabel('')
            plt.xticks(rotation=90)
        plt.tight_layout(pad=0.3)

    plt.show()

In [None]:
# Correlation heatmap plus per-category bar plots against 'saleprice'.
multivariate_visualization(df, 'saleprice', cats, conts)

### 4. Build your initial model using these features and estimate the parameters using OLS.