## Data Cleaning and Wrangling - Solar Panel Housing CSVs
This notebook contains data cleaning and wrangling for MLS housing data.

In [1]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import random
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read pop_sample_a.csv (located under the user's home directory) into pop_df.
pop_df = pd.read_csv(
    filepath_or_buffer='~/data_science_projects/solar_panel_data/pop_sample_a.csv'
)

In [3]:
# Remove the listed columns from pop_df; every other column is kept as-is.
pop_df = pop_df.drop(
    columns=[
        'Stat', 'CP', 'SP:LP', 'Elec Suppl', 'Closed Date',
        'Electricity', '% PCH - OLP', '% PCH - PLP',
    ]
)

In [4]:
# Remove the two rows whose index labels are 1614 and 2046.
# NOTE(review): the reason these specific rows are excluded is not recorded
# in this notebook -- confirm and document it.
pop_df = pop_df.drop(labels=[1614, 2046], axis='index')

In [5]:
# Relabel the "SP:OLP" column as "% SP:OLP".
pop_df = pop_df.rename(columns={"SP:OLP": "% SP:OLP"})

In [6]:
# Strip the literal '%' character from every value, then cast the column to int.
pop_df['% SP:OLP'] = (
    pop_df['% SP:OLP']
    .str.replace('%', '', regex=False)
    .astype(int)
)

In [7]:
def new_const(row):
    """Return 1 when the row's 'New Const' value is exactly 'Yes', otherwise 0.

    `row` is anything indexable by the key 'New Const' (e.g. a DataFrame row).
    """
    return 1 if row['New Const'] == 'Yes' else 0
# Evaluate new_const once per row and store the 0/1 result in a 'New' column.
pop_df['New'] = pop_df.apply(func=new_const, axis='columns')

In [8]:
# Read solar_remarks.csv (located under the user's home directory) into solar_df1.
solar_df1 = pd.read_csv(
    filepath_or_buffer='~/data_science_projects/solar_panel_data/solar_remarks.csv'
)

In [9]:
def has_solar(row):
    """Flag a listing whose remarks mention a solar installation.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'Remarks' entry holding the listing's free-text remarks.

    Returns
    -------
    int
        1 if any of the solar keywords occurs in the remarks, else 0.
        Matching is case-insensitive, so "Solar Panels" and "photovoltaic"
        are both recognised. Remarks that are not text (e.g. the NaN pandas
        produces for an empty CSV field) yield 0 instead of raising.
    """
    # All keywords are lower-case because they are compared against the
    # lower-cased remarks. 'solar panels' is not listed separately: any text
    # containing it also contains 'solar panel'.
    keywords = ('photovoltaic', 'solar panel', 'passive solar',
                'solar system', 'solar energy')

    remarks = row['Remarks']
    if not isinstance(remarks, str):
        # Missing remarks cannot mention solar; a substring test on NaN would
        # raise TypeError.
        return 0

    text = remarks.lower()
    return int(any(keyword in text for keyword in keywords))
# Evaluate has_solar once per row and store the 0/1 result in a 'has_solar' column.
solar_df1['has_solar'] = solar_df1.apply(func=has_solar, axis='columns')

In [10]:
# Keep only the rows whose 'has_solar' flag equals 1.
solar_df1 = solar_df1.loc[solar_df1['has_solar'].eq(1)]

In [11]:
# Read solar_field.csv (located under the user's home directory) into solar_df2.
solar_df2 = pd.read_csv(
    filepath_or_buffer='~/data_science_projects/solar_panel_data/solar_field.csv'
)

In [12]:
# Stack the remark-matched listings (solar_df1) on top of solar_df2 and
# renumber the rows 0..n-1.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; concat gives the same result on every pandas version.
solar_df = pd.concat([solar_df1, solar_df2], ignore_index=True)

In [13]:
# Collapse rows sharing the same 'MLS #' down to a single row each.
# keep='first' is the pandas default, spelled out here: the earliest row wins.
solar_df = solar_df.drop_duplicates(subset=['MLS #'], keep='first')

In [14]:
# Remove the listed columns from solar_df, including the 'has_solar' helper
# flag; every other column is kept as-is.
solar_df = solar_df.drop(
    columns=[
        'Stat', 'CP', 'SP:LP', 'Elec Suppl',
        'Closed Date', 'Electricity', '% PCH - OLP',
        '% PCH - PLP', 'has_solar',
    ]
)

In [15]:
# Relabel the "SP:OLP" column as "% SP:OLP".
solar_df = solar_df.rename(columns={"SP:OLP": "% SP:OLP"})

In [16]:
# Strip the literal '%' character from every value, then cast the column to int.
solar_df['% SP:OLP'] = (
    solar_df['% SP:OLP']
    .str.replace('%', '', regex=False)
    .astype(int)
)

In [17]:
# `new_const` is already defined in an earlier cell, where it builds
# pop_df['New']. The identical copy that used to be re-declared here only
# shadowed that definition, so the existing helper is reused instead:
# it returns 1 when a row's 'New Const' value is 'Yes' and 0 otherwise.
solar_df['New'] = solar_df.apply(new_const, axis=1)

In [18]:
# Persist pop_df as a pickle file in the current working directory.
pop_df.to_pickle(path="./witho_df.pkl")

* Hypothesis testing
* Calculate Cohen's d
* Z-test, t-test
* P-value, accuracy
* Result of hypothesis

In [19]:
# Persist solar_df as a pickle file in the current working directory.
solar_df.to_pickle(path="./with_df.pkl")