In [1]:
#Step 1 -- Create a list of 5000 random properties in the Pittsburgh area

In [2]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import string

In [3]:
# Pools used to build synthetic street addresses: street-type suffixes and
# street names. Spellings such as 'Brookyn' and 'HYWY' are kept exactly as
# they are because the recorded outputs of this notebook contain them.
street_suffix = [
    'ST', 'RD', 'WAY', 'LN', 'AVE', 'DR',
    'BLVD', 'RUN', 'HYWY', 'TRAIL', 'COURT', 'TR',
]
street_names = [
    'Long', 'Sarah', 'Larkins', 'Sunset', 'Rainbow', 'Platypus', 'Mountain',
    'Delta', 'Sky', 'Money', 'Python', 'River', 'Crest', 'Run', 'Crane',
    'Summer', 'Green', 'Axe', 'Slide', 'Captain', 'Dog', 'Carson', 'Perry',
    'Grape', 'Broadhead', 'Timber', 'Sunshine', 'Babcock', 'Black', 'Grass',
    'Brookyn',
]

In [4]:
# Build 5000 "<name> <suffix>" street strings by pairing a random street name
# with a random suffix. The RNG is re-seeded so the cell is repeatable.
seed_value = 1
random.seed(seed_value)
random_streets = [
    # The name is drawn before the suffix; that order fixes the random sequence.
    " ".join((random.choice(street_names), random.choice(street_suffix)))
    for _ in range(5000)
]

In [5]:
# Street numbers: 5000 integers drawn uniformly from the inclusive range
# below, with the RNG re-seeded so the cell is repeatable.
MIN_STREET_NUMBER, MAX_STREET_NUMBER = 1, 99099
seed_value = 1
random.seed(seed_value)
random_st_num = [
    random.randint(MIN_STREET_NUMBER, MAX_STREET_NUMBER)
    for _ in range(5000)
]


In [6]:
# Pair every street number with a street name, shuffle the pairs, and keep only
# the first occurrence of each (number, street) pair so no address repeats.
seed_value = 1
random.seed(seed_value)
address = random.sample(list(zip(random_st_num, random_streets)), len(random_streets))
random.shuffle(address)
# dict keys are unique and keep insertion order, so this drops repeated pairs
# while preserving the shuffled order. If any pair did repeat, the result is
# shorter than the input list.
shuffled_list = list(dict.fromkeys(address))
used_addresses = set(shuffled_list)

In [7]:
#Will need to remove the punctuation from the address

In [8]:
#seed_value = 1
#random.seed(seed_value)
#beds = [random.randint(1, 5) for _ in range(5000)]
#print(beds)

In [9]:
# Bathrooms: one count per property, drawn uniformly from the inclusive range
# below, with the RNG re-seeded so the cell is repeatable.
MIN_BATHS, MAX_BATHS = 1, 3
seed_value = 1
random.seed(seed_value)
baths = [random.randint(MIN_BATHS, MAX_BATHS) for _ in range(5000)]

In [10]:
# Start the DataFrame with the two columns generated so far. Both inputs must
# have the same length or pandas refuses to build the frame.
data = dict(Address=shuffled_list, Baths=baths)
df = pd.DataFrame(data)
df

Unnamed: 0,Address,Baths
0,"(1425, Summer RD)",1
1,"(13856, Grape BLVD)",3
2,"(23018, Dog ST)",1
3,"(92756, Brookyn BLVD)",2
4,"(63377, Carson BLVD)",1
...,...,...
4995,"(69277, Summer DR)",3
4996,"(63848, Dog ST)",2
4997,"(89043, Broadhead WAY)",1
4998,"(18465, Captain HYWY)",2


In [11]:
# Store Baths as strings. Later cells cast the column back with .astype(int)
# before using it numerically.
df['Baths'] = df['Baths'].astype(str)

In [12]:
#We want to create bedrooms but the number of baths cannot exceed the number of beds
seed_value = 1
random.seed(seed_value)
def generate_random_beds(row):
    """Return a random bedroom count for a bathroom count.

    Despite its name, ``row`` is compared directly against the bathroom
    counts 1, 2 and 3, so it is a single value rather than a DataFrame row.
    Each count maps to an inclusive range of bedrooms whose lower bound is
    the bathroom count itself; any other value yields 0.
    """
    bed_ranges = ((1, (1, 2)), (2, (2, 4)), (3, (3, 5)))
    for bath_count, (fewest, most) in bed_ranges:
        if row == bath_count:
            return random.randint(fewest, most)
    return 0  # Default value, you can adjust as needed

# Draw one bedroom count per row from that row's bathroom count.
# (A `conditions` list used to be built here but was never read; the
# neighborhood cell further down builds its own.)
#
# NOTE: per the NumPy docs, np.vectorize without `otypes` calls the function
# one extra time on the first element to infer the output type, which consumes
# one extra random draw. `otypes` is deliberately left unset so the seeded
# random sequence, and therefore the generated Beds values, stay unchanged.
beds_mapping = np.vectorize(generate_random_beds)
df['Beds'] = beds_mapping(df['Baths'].astype(int))
print(df)

                     Address Baths  Beds
0          (1425, Summer RD)     1     1
1        (13856, Grape BLVD)     3     4
2            (23018, Dog ST)     1     1
3      (92756, Brookyn BLVD)     2     3
4       (63377, Carson BLVD)     1     2
...                      ...   ...   ...
4995      (69277, Summer DR)     3     3
4996         (63848, Dog ST)     2     4
4997  (89043, Broadhead WAY)     1     2
4998   (18465, Captain HYWY)     2     4
4999    (71528, Black COURT)     3     3

[5000 rows x 3 columns]


In [13]:
# Show the frame with the newly added Beds column.
df

Unnamed: 0,Address,Baths,Beds
0,"(1425, Summer RD)",1,1
1,"(13856, Grape BLVD)",3,4
2,"(23018, Dog ST)",1,1
3,"(92756, Brookyn BLVD)",2,3
4,"(63377, Carson BLVD)",1,2
...,...,...,...
4995,"(69277, Summer DR)",3,3
4996,"(63848, Dog ST)",2,4
4997,"(89043, Broadhead WAY)",1,2
4998,"(18465, Captain HYWY)",2,4


In [14]:
#Convert Address from tuple to string so that punctuation can be removed
# (the text form of a tuple looks like "(1425, 'Summer RD')", i.e. it carries
# parentheses, a comma and quotes that the next step strips out).
df['Address'] = df['Address'].astype(str)

In [15]:
#Remove punctuation
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    All other characters, including spaces, are left in place and in order.
    """
    return text.translate(str.maketrans('', '', string.punctuation))

# Clean every address string, e.g. "(1425, 'Summer RD')" -> "1425 Summer RD".
df["Address"] = df['Address'].apply(remove_punctuations)

In [16]:
# Split each address into its leading digits (StreetNumber) and everything
# after the following whitespace (StreetName), then discard the combined
# Address column.
pattern = r'(?P<StreetNumber>\d+)\s+(?P<StreetName>.+)'
address_parts = df['Address'].str.extract(pattern)
df[['StreetNumber', 'StreetName']] = address_parts
df = df.drop(columns='Address')

In [17]:
# Rebuild a single "number street" Address column from the two extracted
# parts; both are strings, so + concatenates them.
df['Address'] = df['StreetNumber'] + " " + df['StreetName']

In [18]:
# Candidate ZIP codes and neighborhood names for the synthetic properties.
# Addresses on the same street should end up in the same ZIP code and the
# same neighborhood.
ZIP = [
    '15106', '15203', '15093',
    '15672', '15090', '15301',
]
Neighborhood = [
    'Carnegie', 'South Side', 'North Side',
    'Downtown', 'South Fayette', 'North Fayette',
]

In [19]:
def generate_random_code(group):
    """Pick one ZIP code at random from ``ZIP``.

    ``group`` is the set of rows sharing a StreetName; it is not inspected.
    The single returned value is broadcast to every row of the group by
    ``transform``, which is what keeps one street inside one ZIP code.
    """
    return np.random.choice(ZIP)

# Seed NumPy's global RNG, as the other random cells seed `random`. Without
# this the ZIP assignment (and everything derived from it) changes on every
# Restart & Run All.
seed_value = 1
np.random.seed(seed_value)
df['ZIP Code'] = df.groupby('StreetName')['StreetName'].transform(generate_random_code)


In [20]:
# The split street columns were only needed to group by street; drop them.
df = df.drop(columns=['StreetNumber', 'StreetName'])

In [21]:
# Map each ZIP code to its neighborhood. Conditions and values are derived
# from one mapping so the two lists cannot drift out of step; any ZIP code
# not listed here is labelled 'other'.
zip_to_neighborhood = {
    '15106': 'Carnegie',
    '15203': 'South Side',
    '15093': 'North Side',
    '15672': 'Downtown',
    '15090': 'South Fayette',
    '15301': 'North Fayette',
}
conditions = [(df['ZIP Code'] == code) for code in zip_to_neighborhood]
values = list(zip_to_neighborhood.values())
df['Neighborhood'] = np.select(conditions, values, default='other')

print(df)

     Baths  Beds              Address ZIP Code   Neighborhood
0        1     1       1425 Summer RD    15301  North Fayette
1        3     4     13856 Grape BLVD    15301  North Fayette
2        1     1         23018 Dog ST    15203     South Side
3        2     3   92756 Brookyn BLVD    15301  North Fayette
4        1     2    63377 Carson BLVD    15093     North Side
...    ...   ...                  ...      ...            ...
4995     3     3      69277 Summer DR    15106       Carnegie
4996     2     4         63848 Dog ST    15203     South Side
4997     1     2  89043 Broadhead WAY    15093     North Side
4998     2     4   18465 Captain HYWY    15090  South Fayette
4999     3     3    71528 Black COURT    15090  South Fayette

[5000 rows x 5 columns]


In [22]:
# Cast Baths back to int so the price and size functions below can compare it
# with integer literals.
# NOTE(review): the ',' removal looks unnecessary, since Baths was generated
# by random.randint(1, 3) -- confirm before dropping it.
df['Baths'] = df['Baths'].astype(str).str.replace(',', '').astype(int)

In [23]:
# One random base price per property, between 60,000 and 350,000 inclusive.
# `random` is not re-seeded in this cell, so the draws continue from wherever
# the earlier cells left the generator; run the notebook top to bottom to
# reproduce the same prices.
df['Base_Price'] = [random.randint(60000, 350000) for _ in range(5000)]

In [24]:
#We are adjusting the price of a property relative to its number of beds baths and the neighborhood it is located in
def calculate_final_price(row):
    """Scale a property's base price by bath, bed and neighborhood multipliers.

    ``row`` must support item access for 'Baths', 'Beds', 'Neighborhood' and
    'Base_Price' (for example a DataFrame row). A value with no entry in the
    tables below contributes a multiplier of 1.0.
    """
    bath_factors = {3: 1.35, 2: 1.25, 1: 1.15}
    bed_factors = {5: 1.4, 4: 1.35, 3: 1.3, 2: 1.25, 1: 1.2}
    area_factors = {
        'Carnegie': 1.1,
        'South Side': 1.1,
        'North Side': 1.1,
        'Downtown': 1.15,
        'South Fayette': 1.2,
        'North Fayette': 1.2,
    }
    baths_multiplier = bath_factors.get(row['Baths'], 1.0)
    beds_multiplier = bed_factors.get(row['Beds'], 1.0)
    neighborhood_multiplier = area_factors.get(row['Neighborhood'], 1.0)
    # Multiplication order is kept as-is so floating-point results do not shift.
    return row['Base_Price'] * baths_multiplier * beds_multiplier * neighborhood_multiplier

# Compute the adjusted price row by row (axis=1 passes each row to the function).
df['Final_Price'] = df.apply(calculate_final_price, axis=1)


In [25]:
# Base size per property: a random integer from 900 to 1400 inclusive.
# NOTE(review): `random` is not re-seeded here, so the values depend on the
# draws made by earlier cells; run the notebook top to bottom to reproduce.
df['sq_ft'] = [random.randint(900, 1400) for _ in range(5000)]

In [26]:
# Inspect the frame after adding Base_Price, Final_Price and sq_ft.
df

Unnamed: 0,Baths,Beds,Address,ZIP Code,Neighborhood,Base_Price,Final_Price,sq_ft
0,1,1,1425 Summer RD,15301,North Fayette,166700,276055.20000,1224
1,3,4,13856 Grape BLVD,15301,North Fayette,226547,495458.28900,1218
2,1,1,23018 Dog ST,15203,South Side,143185,217354.83000,1258
3,2,3,92756 Brookyn BLVD,15301,North Fayette,144144,281080.80000,1368
4,1,2,63377 Carson BLVD,15093,North Side,233224,368785.45000,1007
...,...,...,...,...,...,...,...,...
4995,3,3,69277 Summer DR,15106,Carnegie,76430,147548.11500,1061
4996,2,4,63848 Dog ST,15203,South Side,148556,275757.07500,1035
4997,1,2,89043 Broadhead WAY,15093,North Side,335551,530590.01875,1101
4998,2,4,18465 Captain HYWY,15090,South Fayette,115805,234505.12500,1073


In [27]:
# Base_Price was only an input to Final_Price; remove it from the frame.
df = df.drop(columns='Base_Price')

In [28]:
# Round prices to whole numbers; the column stays float (e.g. 276055.0).
df['Final_Price'] = df['Final_Price'].round()

In [29]:
# Basement size per property: 0 when there is no basement, otherwise a random
# size from 250 to 1000 inclusive.
seed_value = 1
random.seed(seed_value)
basement_choices = ["Yes", "No"]
# All 5000 Yes/No answers are drawn first; the sizes are drawn afterwards,
# one per "Yes". That order fixes the random sequence.
has_basement = random.choices(basement_choices, k=5000)
basement = [
    random.randint(250, 1000) if answer == "Yes" else 0
    for answer in has_basement
]

In [30]:
# Attach the basement sizes (0 means no basement) as a new column.
df['basement'] = basement

In [31]:
def square_footage(row):
    """Return a property's base size plus fixed additions for baths and beds.

    ``row`` must support item access for 'Baths', 'Beds' and 'sq_ft'. Each
    bathroom count (1-3) and bedroom count (1-5) adds a fixed amount; a count
    outside those ranges adds 1.
    """
    bath_area = {3: 150, 2: 100, 1: 50}.get(row['Baths'], 1)
    bed_area = {5: 750, 4: 600, 3: 450, 2: 300, 1: 150}.get(row['Beds'], 1)
    return row['sq_ft'] + bath_area + bed_area

# Base size plus the per-bath and per-bed additions, computed row by row.
df['Square_Footage'] = df.apply(square_footage, axis = 1)

In [32]:
# Add the basement size; rows without a basement add 0.
df['Square_Footage'] = df['Square_Footage'] + df['basement']

In [33]:
# Replace the numeric basement size with a "yes"/"no" flag.
# NOTE: this overwrites the sizes, so it has to run after Square_Footage has
# been computed, and re-running this cell alone would compare strings with 0.
df['basement'] = np.where(df['basement'] > 0, "yes", "no")

In [34]:
# Inspect the frame with the basement flag and the total Square_Footage.
df

Unnamed: 0,Baths,Beds,Address,ZIP Code,Neighborhood,Final_Price,sq_ft,basement,Square_Footage
0,1,1,1425 Summer RD,15301,North Fayette,276055.0,1224,yes,2185
1,3,4,13856 Grape BLVD,15301,North Fayette,495458.0,1218,no,1968
2,1,1,23018 Dog ST,15203,South Side,217355.0,1258,no,1458
3,2,3,92756 Brookyn BLVD,15301,North Fayette,281081.0,1368,yes,2560
4,1,2,63377 Carson BLVD,15093,North Side,368785.0,1007,yes,1813
...,...,...,...,...,...,...,...,...,...
4995,3,3,69277 Summer DR,15106,Carnegie,147548.0,1061,no,1661
4996,2,4,63848 Dog ST,15203,South Side,275757.0,1035,no,1735
4997,1,2,89043 Broadhead WAY,15093,North Side,530590.0,1101,no,1451
4998,2,4,18465 Captain HYWY,15090,South Fayette,234505.0,1073,no,1773


In [35]:
# Per-neighborhood summary: total bedrooms, mean Square_Footage, and the
# number of properties (counted through the Address column).
neighborhood_stats = df.groupby('Neighborhood').agg(
    Beds=('Beds', 'sum'),
    Square_Footage=('Square_Footage', 'mean'),
    Property_Count=('Address', 'count'),
)

print(neighborhood_stats)

               Beds  Square_Footage  Property_Count
Neighborhood                                       
Carnegie       2571     1984.091611             906
Downtown       2219     2012.063389             773
North Fayette  2100     1980.416216             740
North Side     2471     1996.534342             859
South Fayette  2685     1989.080732             929
South Side     2277     1980.353090             793


In [36]:
# sq_ft is already folded into Square_Footage; remove the intermediate column.
df = df.drop(columns='sq_ft')

In [37]:
# Total Final_Price and number of properties per neighborhood, highest total
# first. The 'Neighborhood' column of the result holds the row count.
df.groupby('Neighborhood').agg({'Final_Price': 'sum', 'Neighborhood': 'count'}).sort_values(by='Final_Price', ascending=False)

Unnamed: 0_level_0,Final_Price,Neighborhood
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
South Fayette,367741112.0,929
Carnegie,334452539.0,906
North Side,307960203.0,859
Downtown,303073583.0,773
North Fayette,289669695.0,740
South Side,286480320.0,793
