- prop_id
- prop_type[House,Appartment,Condo]
- neighborhood[Suburb A,Suburb B,Downtown,Rural]
- bedrooms[1,5]
- bathrooms[1,4]
- sq_ft[1000,3000]
- Year_built[1977,2022]
- sale_price[100000,300000]

In [31]:
import random as rd
from pathlib import Path

import pandas as pd

# create dummy data if needed 

In [33]:
# Generate a synthetic real-estate dataset and persist it as CSV.
# Each column is drawn independently of the others, so the data only serves
# to exercise the wrangling steps in the following cells.
N_ROWS=100
rd.seed(42)  # fixed seed: a fresh "Restart & Run All" regenerates the same rows

data={
    'Prop_id':range(1,N_ROWS+1),
    'prop_type':[rd.choice(['House','Appartment','Condo']) for _ in range(N_ROWS)],
    'Neighborhood':[rd.choice(['Suburb A','Suburb B','Downtown','Rural']) for _ in range(N_ROWS)],
    'Bedrooms':[rd.randint(1,5) for _ in range(N_ROWS)],
    'Bathrooms':[rd.randint(1,4) for _ in range(N_ROWS)],
    'Sq_ft':[rd.randint(1000,3000) for _ in range(N_ROWS)],
    'Year_built':[rd.randint(1977,2022) for _ in range(N_ROWS)],
    'sale_price':[rd.randint(100000,300000) for _ in range(N_ROWS)]
}
df=pd.DataFrame(data)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (otherwise a fresh checkout fails here).
csv_path=Path("./dummy data set/Real_estate.csv")
csv_path.parent.mkdir(parents=True,exist_ok=True)
df.to_csv(csv_path,index=False)


In [34]:
# Reload the dataset from the CSV on disk; the bare `df` on the last line
# makes the notebook render the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="./dummy data set/Real_estate.csv")
df

Unnamed: 0,Prop_id,prop_type,Neighborhood,Bedrooms,Bathrooms,Sq_ft,Year_built,sale_price
0,1,Condo,Rural,4,4,1934,2016,275729
1,2,House,Rural,1,1,1402,2022,134265
2,3,House,Suburb B,4,4,1463,1988,291206
3,4,Appartment,Downtown,2,1,2497,1988,208841
4,5,Appartment,Suburb B,2,4,1787,1984,192586
...,...,...,...,...,...,...,...,...
95,96,Appartment,Rural,2,3,2488,1982,183387
96,97,House,Suburb B,2,4,2812,2015,173658
97,98,House,Rural,4,1,2423,1983,273458
98,99,House,Downtown,1,1,2891,2000,156762


 # Handle missing values in the dataset, deciding on an appropriate strategy (e.g., imputation or removal).

In [35]:
# Per-column tally of missing entries (isnull is pandas' alias for isna).
df.isnull().sum()

Prop_id         0
prop_type       0
Neighborhood    0
Bedrooms        0
Bathrooms       0
Sq_ft           0
Year_built      0
sale_price      0
dtype: int64

# Filter and subset the data based on specific criteria, such as a particular time period, property type, or location.

In [36]:
# Subset 1: rows whose property type matches a single chosen category.
Prop_type = 'House'
prop_suubset = df.loc[df['prop_type'] == Prop_type]

# Subset 2: rows located in a single chosen neighborhood.
neighborhood = 'Suburb A'
neigh_subset = df.loc[df['Neighborhood'] == neighborhood]

# Subset 3: rows built inside a year window; both endpoints are included.
start_year = 2001
end_year = 2022
year_subset = df[df['Year_built'].between(start_year, end_year)]

# Emit each label followed by its subset on the next line.
print("Pro_Type: ", prop_suubset, sep="\n")

print('neighborhood: ', neigh_subset, sep="\n")

print('Time Period: ', year_subset, sep="\n")

Pro_Type: 
    Prop_id prop_type Neighborhood  Bedrooms  Bathrooms  Sq_ft  Year_built  \
1         2     House        Rural         1          1   1402        2022   
2         3     House     Suburb B         4          4   1463        1988   
6         7     House     Suburb A         5          1   2980        2004   
9        10     House     Suburb A         3          1   2583        1994   
11       12     House        Rural         3          1   1749        2001   
21       22     House     Suburb B         3          2   2227        2008   
22       23     House        Rural         1          2   2364        1980   
24       25     House     Downtown         5          4   2532        1995   
27       28     House        Rural         4          4   1094        2012   
30       31     House     Downtown         3          2   2117        1995   
32       33     House     Suburb B         3          4   2853        2007   
35       36     House     Suburb A         5         

#  Handle categorical variables by encoding them appropriately (e.g., one-hot encoding or label encoding) for further analysis.

In [37]:
# One-hot encode the two categorical columns: each category becomes its own
# indicator column named <original column>_<category>.
df_encoded = pd.get_dummies(data=df, columns=['prop_type', 'Neighborhood'])
print(df_encoded)

    Prop_id  Bedrooms  Bathrooms  Sq_ft  Year_built  sale_price  \
0         1         4          4   1934        2016      275729   
1         2         1          1   1402        2022      134265   
2         3         4          4   1463        1988      291206   
3         4         2          1   2497        1988      208841   
4         5         2          4   1787        1984      192586   
..      ...       ...        ...    ...         ...         ...   
95       96         2          3   2488        1982      183387   
96       97         2          4   2812        2015      173658   
97       98         4          1   2423        1983      273458   
98       99         1          1   2891        2000      156762   
99      100         2          2   1794        2022      263142   

    prop_type_Appartment  prop_type_Condo  prop_type_House  \
0                  False             True            False   
1                  False            False             True   
2        

#  Aggregate the data to calculate summary statistics or derived metrics such as average sale prices by neighborhood or property type.

In [38]:
# Mean sale price within each neighborhood group.
avg_price_by_neighborhood = df.groupby(by='Neighborhood')['sale_price'].agg('mean')
print(avg_price_by_neighborhood)

Neighborhood
Downtown    200498.884615
Rural       196122.928571
Suburb A    167452.500000
Suburb B    207659.708333
Name: sale_price, dtype: float64


#  Identify and handle outliers:

In [39]:
from scipy import stats

# Standardise the bathroom counts and flag every row whose z-score lies more
# than `threshold` standard deviations from the mean, in either direction.
threshold = 3
z_scores = stats.zscore(df['Bathrooms'])
outliers = abs(z_scores) > threshold
print(outliers)


0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Name: Bathrooms, Length: 100, dtype: bool
