## This code was run after all data had been cleaned, in order to add the baths attribute to the analysis

In [5]:
# pandas to work with dataframes
import pandas as pd
# import re to use regular expressions
import re
# import numpy
import numpy as np
# import glob to bring together multiple csv files
import glob

In [6]:
# search for csvs containing web-scraped data with the name scheme;
# glob returns matches in file-system order, so sort them to make the row
# order of the combined data (and the later "keep first" duplicate removal)
# reproducible between runs and machines
csvs = sorted(glob.glob(r'*Craigslist_ApartmentData_Minneapolis*.csv'))
# column names applied to every csv so the dataframes line up when combined
column_names = [
    'address', 'square feet', 'beds_baths', 'datetime', 'square feet 2',
    'latitude', 'title', 'longitude', 'map accuracy', 'price', 'url',
    'post id',
]
# read each csv as a dataframe with the same column names and collect them in dfs
dfs = [pd.read_csv(csv_path, names=column_names) for csv_path in csvs]

In [7]:
# concatenate dataframes in dfs together vertically
df = pd.concat(dfs, axis=0)
# flag rows with a missing address: True when the address is null, False when
# it is present (a real null test, so addresses that merely start with the
# letters "nan" are not flagged)
df['address_check'] = df['address'].isna()
# sort so rows that have an address come first (False sorts before True);
# mergesort is stable, so rows keep their original relative order within each
# group and the later "keep first" duplicate removal is reproducible
df = df.sort_values(['address_check'], kind='mergesort')
# replace the index with consecutive integers 0..n-1, keeping the name "index"
df = df.reset_index(drop=True).rename_axis('index')
# set entire df type as string, as most operations will require data type to be string
# NOTE: after this cast, missing values are the literal string 'nan', not real nulls
df = df.astype(str)

In [8]:
# a row counts as having no post id when the value is null or the literal
# string 'nan' (what a null becomes once the dataframe has been cast to str)
post_ids = df['post id']
missing_post_id = post_ids.isnull() | post_ids.eq('nan')
# rows without a post id: drop duplicates using the "title" column instead
null_id = df[missing_post_id].drop_duplicates(subset=['title'], keep='first')
# rows with a post id: drop duplicates using the "post id" column
postid_no_null = df[~missing_post_id].drop_duplicates(subset=['post id'], keep='first')
# concatenate vertically null_id and postid_no_null dataframes
df = pd.concat([null_id, postid_no_null])
# drop latitude, longitude, map accuracy columns
df = df.drop(columns=['latitude', 'longitude', 'map accuracy'])

In [9]:
# pattern for a baths value: an integer or decimal number immediately followed
# by "Ba" (raw string so that \d and \. reach the regex engine unchanged)
baths_pattern = re.compile(r'\d*\.?\d+Ba')
# every baths pattern found in each beds_baths cell (cells are stringified
# first so a non-string value cannot make the regex call fail)
baths_matches = [baths_pattern.findall(str(cell)) for cell in df['beds_baths']]
# column to store the raw matches, e.g. ['1.5Ba']
df['value_baths'] = baths_matches
# True when at least one baths pattern exists in the beds_baths cell
df['baths_boolean'] = [bool(found) for found in baths_matches]
# baths column holds only the number: take the first match and strip the
# trailing "Ba"; rows without a match keep the empty string
df['baths'] = [found[0][:-2] if found else '' for found in baths_matches]

In [10]:
# keep only the extracted baths value plus the two join keys (url, datetime)
df_baths = df.loc[:, ['baths', 'url', 'datetime']]
# save the slice to disk (the index is written as well)
df_baths.to_csv(path_or_buf="get_bath_values.csv")

In [11]:
# load the previously cleaned dataset that the baths values will be joined onto
df_full = pd.read_csv(
    filepath_or_buffer=r"C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Collection and Cleaning\Data Sources\cleaned_data_final.csv"
)

In [12]:
# right join on url + datetime: every row of df_full is kept and receives its
# baths value from df_baths where a matching url/datetime pair exists
df_baths_merged = pd.merge(
    df_baths,
    df_full,
    how='right',
    on=["url", "datetime"],
)

In [13]:
# write the merged dataframe (cleaned data plus baths) to disk
df_baths_merged.to_csv(
    path_or_buf=r"C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Collection and Cleaning\Data Sources\final_df_with_baths.csv"
)

## Perform One Final Cleaning of the Data

In [49]:
# reload the merged dataset (cleaned data plus baths) from disk
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Collection and Cleaning\Data Sources\final_df_with_baths.csv"
)

In [50]:
# replace null baths values with the placeholder text "Not Available"
df = df.assign(baths=df['baths'].fillna("Not Available"))

In [41]:
# display the column labels of df
df.columns

Index(['Unnamed: 0', 'baths', 'url', 'datetime', 'index', 'address',
       'beds boolean', 'beds_baths', 'county', 'county_full', 'geocode_lat',
       'geocode_lon', 'post id', 'price', 'price boolean', 'square feet',
       'square feet boolean', 'title', 'County', 'City', 'one to studio',
       'bed assigned', 'index match', 'no match', 'outlier'],
      dtype='object')

In [51]:
# drop the columns listed below (each label appears exactly once)
df = df.drop(
    columns=[
        'index',
        'Unnamed: 0',
        'county_full',
        'outlier',
        'no match',
        'index match',
        'County',
    ]
)
# rename the beds_baths column to beds
df = df.rename(columns={'beds_baths': 'beds'})

In [54]:
# write the final dataframe to disk as machine_learning_df.csv
df.to_csv(
    path_or_buf=r"C:\Users\murra667\Documents\Springboard\Capstone _1\Data_Collection\Craigslist Data\geocode_2\Data Collection and Cleaning\Data Sources\machine_learning_df.csv"
)