In [None]:
# Importing the necessary libraries for data visualization and analysis
import matplotlib.pyplot as plt  # Matplotlib
import seaborn as sns            # Seaborn 
import numpy as np               # NumPy 
import pandas as pd              # Pandas 
import matplotlib.ticker as mticker  # Ticker


In [None]:
# Load the housing dataset; the first CSV column becomes the DataFrame index
df_housing = pd.read_csv('housing.csv', index_col=0)

# Lift pandas' display truncation so that every row and every column is
# rendered in cell output (None means "no limit" for both options)
for display_option in ("display.max_rows", 'display.max_columns'):
    pd.set_option(display_option, None)

df_housing  # Last expression of the cell: renders the loaded DataFrame


In [None]:
# Columns of 'df_housing' that the rest of the analysis works with
selected_columns = ['SalePrice', 'YearBuilt', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Neighborhood']

# Builds 'housing_data' as an independent DataFrame holding only those columns,
# so later modifications never touch 'df_housing'
housing_data = df_housing[selected_columns].copy()

housing_data  # Renders the 'housing_data' DataFrame


In [None]:
# Re-binds 'housing_data' to a version that is ordered from the highest to the
# lowest 'SalePrice' and whose 'OverallCond' column is cast to integer type
housing_data = (
    housing_data
    .sort_values(by='SalePrice', ascending=False)
    .astype({'OverallCond': int})
)

housing_data  # Renders the updated 'housing_data' DataFrame


In [None]:
# Total 'SalePrice' per neighborhood, expressed in millions of dollars so the
# axis values and bar annotations stay short
neighborhood_values = (
    housing_data
    .groupby('Neighborhood')['SalePrice']
    .sum()
    .div(1_000_000)
    .reset_index()
)

# Draws the bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 7))
bars = ax.bar(neighborhood_values['Neighborhood'], neighborhood_values['SalePrice'], color='skyblue')

# Writes each neighborhood's total just above its bar, centred horizontally
for rect in bars:
    total_millions = rect.get_height()                # bar height == total in millions
    bar_centre = rect.get_x() + rect.get_width() / 2  # horizontal midpoint of the bar
    ax.text(bar_centre, total_millions, f'{total_millions:,.1f}M', ha='center', va='bottom')

# Title and axis labels
ax.set_title('Neighborhood Collective Market Value')
ax.set_xlabel('Neighborhoods')
ax.set_ylabel('Total Sale Price (in millions)')

# Tilts the neighborhood names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Dashed horizontal guide lines
ax.grid(axis='y', linestyle='--', alpha=0.8)

fig.tight_layout()  # Keeps the rotated labels from being clipped

plt.show()  # Renders the figure


In [None]:
# Maps each neighborhood code used in the 'Neighborhood' column to its
# human-readable name. Keeping code and label together in one mapping
# guarantees that every summary row is labelled with the neighborhood its
# statistics were actually computed from (two separately maintained lists can
# silently drift out of order and swap labels between rows).
neighborhood_names = {
    'NridgHt': 'Northridge Heights',
    'CollgCr': 'College Creek',
    'NAmes': 'North Ames',
    'Somerst': 'Somerset',
    'Gilbert': 'Gilbert',
}

# Neighborhood codes of interest, in the order they appear in the summary
neighborhoods = list(neighborhood_names)

# Creates lists to hold the results (one entry per neighborhood, same order)
counts = []
means = []

# Loops through each neighborhood to calculate count and mean
for neighborhood in neighborhoods:
    housing_data_subset = housing_data[housing_data['Neighborhood'] == neighborhood]
    counts.append(housing_data_subset.shape[0])  # Count of houses
    means.append(round(housing_data_subset['SalePrice'].mean(), 2))  # Mean sale price

# Creates a DataFrame to summarize the results; the labels are looked up from
# the same codes that produced 'counts' and 'means'
rank_summary_df = pd.DataFrame({
    'Neighborhood': [neighborhood_names[code] for code in neighborhoods],
    'Count': counts,
    'Mean Sale Price': means
})

print(rank_summary_df) # Prints the counts and mean sale prices


In [None]:
# Rows of 'housing_data' that belong to the North Ames neighborhood (code
# 'NAmes'). It is defined here, at its first use, because no earlier cell
# creates 'NAmes_housing'; without this line a fresh "Restart & Run All"
# stops with a NameError.
NAmes_housing = housing_data[housing_data['Neighborhood'] == 'NAmes'].copy()

# Counts the number of houses for each OverallCond level (1 to 10);
# levels with no houses are reported as 0
overall_cond_counts = [(NAmes_housing['OverallCond'] == i).sum() for i in range(1, 11)]

# Creates a summary DataFrame with one row per condition level
count_NAmes_cond = pd.DataFrame({
    'OverallCond': range(1, 11),
    'Number of Houses': overall_cond_counts
})

print(count_NAmes_cond) # Prints the summary DataFrame


In [None]:
# NOTE(review): 'NAmes_housing' is not created in this cell; it must already
# exist in the kernel before this cell runs.

# Rating levels to tabulate: 1 through 10 inclusive
quality_levels = range(1, 11)

# For every level, counts the houses whose OverallQual equals that level
overall_qual_counts = [
    NAmes_housing['OverallQual'].eq(level).sum()
    for level in quality_levels
]

# Summary table: one row per quality level with its number of houses
count_NAmes_qual = pd.DataFrame({
    'OverallQual': quality_levels,
    'Number of Houses': overall_qual_counts
})

print(count_NAmes_qual) # Prints the summary DataFrame


In [None]:
# Draws both rating distributions on one explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 7))

# One line per rating type: (x column, marker, colour, legend label)
rating_series = [
    (count_NAmes_cond, 'OverallCond', 'o', 'skyblue', 'Overall Condition'),
    (count_NAmes_qual, 'OverallQual', 'v', 'hotpink', 'Overall Quality'),
]
for summary, rating_column, marker_shape, line_color, legend_label in rating_series:
    ax.plot(summary[rating_column], summary['Number of Houses'],
            marker=marker_shape, color=line_color, label=legend_label)

# Title and axis labels
ax.set_title('Overall Condition and Quality of Houses in the North Ames Neighborhood')
ax.set_xlabel('Overall Condition/Quality')
ax.set_ylabel('Number of Houses')

# One tick per rating level, 1 through 10
ax.set_xticks(range(1, 11))

# Dashed guide lines along both axes
ax.grid(axis='both', linestyle='--', alpha=0.8)

ax.legend()  # Key distinguishing the two lines

plt.show()  # Renders the figure


In [None]:
# Row masks over 'NAmes_housing':
#   - Overall Quality from 5 to 10 (both ends included)
#   - Overall Condition of 5 or higher
quality_in_range = NAmes_housing['OverallQual'].between(5, 10)
condition_ok = NAmes_housing['OverallCond'].ge(5)

# Keeps only rows satisfying both masks. The explicit copy makes the result an
# independent DataFrame, so adding a column below does not trigger pandas'
# chained-assignment warning.
high_rating_houses = NAmes_housing.loc[quality_in_range & condition_ok].copy()

# Flags a house as renovated when its remodel year is later than its build year
high_rating_houses['Renovated'] = high_rating_houses['YearRemodAdd'].gt(high_rating_houses['YearBuilt'])

# Renders the filtered houses together with the new 'Renovated' column
high_rating_houses


In [None]:
# Boolean 'Renovated' flag of every high-rating house
renovated_flags = high_rating_houses['Renovated']

# Tally of each flag value: a Series indexed by True (renovated) /
# False (not renovated) holding the number of houses in each group
renovation_counts = renovated_flags.value_counts()

renovation_counts  # Renders the renovated / non-renovated counts


In [None]:
# Splits the sale prices of the high-rating houses by their 'Renovated' flag
# and takes the mean of each group (renovated vs. not renovated)
sale_prices_by_renovation = high_rating_houses['SalePrice'].groupby(high_rating_houses['Renovated'])
average_prices = sale_prices_by_renovation.mean()

average_prices  # Renders the mean sale price of each group
