In [6]:
import pandas as pd
# Read the cleaned housing CSV into a DataFrame and preview its first rows.
csv_path = '/content/kc_house_data_cleaned.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,log_sqft_lot,log_basement_lot,rs_age,renovated
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,1,...,0,98178,47.5112,-122.257,1340,5650,8.639588,0.0,69,0
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,1,...,1991,98125,47.721,-122.319,1690,7639,8.887791,5.993961,73,1
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,1,...,0,98028,47.7379,-122.233,2720,8062,9.21044,0.0,91,0
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,1,...,0,98136,47.5208,-122.393,1360,5000,8.517393,6.814543,59,0
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,1,...,0,98074,47.6168,-122.045,1800,7503,8.997271,0.0,37,0


In [4]:
!pip install pandas scikit-learn matplotlib seaborn



In [8]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned dataset and print its first rows as a quick sanity check
df = pd.read_csv('/content/kc_house_data_cleaned.csv')
print(df.head())

# Columns used to measure how alike two listings are
feature_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors', 'sqft_lot']
features = df[feature_columns]

# Standardize the selected columns before computing neighbour distances
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit a 5-nearest-neighbours index on the standardized feature matrix
knn = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(scaled_features)

# Function to find similar listings based on an 'id'
def recommend_similar(id, df, knn_model, features, scaler):
    """Return the listings most similar to the listing with the given ``id``.

    Parameters
    ----------
    id : scalar
        Value to look up in ``df['id']``. If several rows share it, the
        first one is used as the query.
    df : pandas.DataFrame
        Full dataset; must contain an ``'id'`` column. It is not modified.
    knn_model : object
        Fitted neighbour index exposing ``kneighbors(X)`` that returns a
        ``(distances, indices)`` pair, where ``indices`` are row positions.
    features : pandas.DataFrame
        Unscaled feature columns, row-aligned (by position) with ``df``.
    scaler : object
        Fitted scaler exposing ``transform(X)``.

    Returns
    -------
    pandas.DataFrame or None
        A copy of the neighbouring rows of ``df`` with an added
        ``'distance'`` column, excluding the query row itself. ``None``
        (after printing a message) when ``id`` is not present.
    """
    # Work with row *positions* throughout: kneighbors() reports positions,
    # and index labels only coincide with them for a default RangeIndex.
    match_positions = (df['id'] == id).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        print(f"ID {id} not found in the dataset.")
        return None
    item_pos = int(match_positions[0])

    # Keep the query as a one-row DataFrame so the scaler receives the same
    # column layout (and names) as the frame the features were taken from.
    item_features = features.iloc[[item_pos]]
    scaled_item_features = scaler.transform(item_features)

    # Find the nearest neighbours of the query point
    distances, indices = knn_model.kneighbors(scaled_item_features)
    distances = distances.ravel()
    indices = indices.ravel()

    # Drop the query row with one mask applied to both arrays, so each
    # distance stays paired with its own neighbour even when the query row
    # is not first in (or is absent from) the result, e.g. with duplicates.
    keep = indices != item_pos

    similar_items = df.iloc[indices[keep]].copy()
    similar_items['distance'] = distances[keep]

    return similar_items

# Example: recommend similar listings for an id that is known to exist.
# The first row's id is used so the demo always produces results; replace it
# with any other value from df['id'] to explore a different listing.
id_to_recommend = df['id'].iloc[0]
similar_items = recommend_similar(id=id_to_recommend, df=df, knn_model=knn, features=features, scaler=scaler)

# Report and visualise the neighbours of the queried listing. Nothing is
# shown when the lookup failed (recommend_similar returned None).
if similar_items is not None:
    print("Similar Items Found:")
    print(similar_items[['id', 'price', 'sqft_living', 'distance']])

    # The queried listing itself; it is the reference point in the plots below
    original_item = df[df['id'] == id_to_recommend]

    # Scatter Plot: neighbours in blue, queried listing highlighted in red
    plt.figure(figsize=(12, 8))
    plt.scatter(similar_items['sqft_living'], similar_items['price'], c='blue', alpha=0.6, label='Similar Listings')
    plt.scatter(original_item['sqft_living'], original_item['price'], c='red', s=200, label='Original Listing', edgecolors='black')

    # Annotate each similar item with its distance to the queried listing
    for dist, sqft, price in zip(similar_items['distance'],
                                 similar_items['sqft_living'],
                                 similar_items['price']):
        plt.annotate(f'Dist: {dist:.2f}',
                     (sqft, price),
                     textcoords="offset points",
                     xytext=(0,10),
                     ha='center')

    plt.title(f'Similar Listings to ID {id_to_recommend}', fontsize=16)
    plt.xlabel('Square Footage (sqft_living)', fontsize=14)
    plt.ylabel('Price ($)', fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

    # Histograms of Price and Square Footage.
    # The original and the similar listings are passed as TWO datasets so
    # that the two colours/labels match the number of datasets (hist()
    # requires exactly one colour per dataset).
    histogram_specs = [
        ('price', 'Price Distribution of Original and Similar Listings', 'Price ($)'),
        ('sqft_living', 'Square Footage Distribution of Original and Similar Listings', 'Square Footage (sqft_living)'),
    ]
    for column, title, xlabel in histogram_specs:
        plt.figure(figsize=(12, 6))
        plt.hist([[original_item[column].values[0]], similar_items[column].values.tolist()],
                 bins=20, color=['red', 'blue'],
                 label=['Original Listing', 'Similar Listings'],
                 alpha=0.6)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=14)
        plt.ylabel('Frequency', fontsize=14)
        plt.legend()
        plt.grid()
        plt.show()

    # Pair Plot for Original and Similar Items.
    # Colouring is driven by an explicit 'listing_type' column whose values
    # are exactly the palette keys; using 'id' as hue would not match them.
    combined_items = pd.concat([original_item, similar_items])
    combined_items['listing_type'] = ['original'] * len(original_item) + ['similar'] * len(similar_items)
    # diag_kind='hist' because a density estimate is not meaningful for a
    # group that may contain a single observation (the original listing).
    sns.pairplot(combined_items, vars=['price', 'sqft_living', 'bedrooms', 'bathrooms'],
                 hue='listing_type', palette={'original': 'red', 'similar': 'blue'}, markers=["o", "s"],
                 diag_kind='hist')
    # suptitle titles the whole grid; plt.title would only label one subplot
    plt.suptitle('Pair Plot of Original and Similar Listings', fontsize=16, y=1.02)
    plt.show()

    # Boxplots for Price and Square Footage (wide-form data: one box per
    # column, drawn with the default orientation)
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=combined_items[['price', 'sqft_living']])
    plt.title('Boxplots of Price and Square Footage', fontsize=16)
    plt.ylabel('Value', fontsize=14)
    plt.xlabel('Features', fontsize=14)
    plt.grid()
    plt.show()

    # Density Plots for Price and Square Footage
    plt.figure(figsize=(12, 6))
    sns.kdeplot(data=combined_items['price'], label='Price', color='blue', fill=True, alpha=0.5)
    sns.kdeplot(data=combined_items['sqft_living'], label='Square Footage', color='red', fill=True, alpha=0.5)
    plt.title('Density Plot of Price and Square Footage', fontsize=16)
    plt.xlabel('Value', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

    # Correlation Heatmap over the original listing plus its neighbours
    plt.figure(figsize=(10, 8))
    correlation_matrix = combined_items[['price', 'sqft_living', 'bedrooms', 'bathrooms']].corr()
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
    plt.title('Correlation Heatmap of Features', fontsize=16)
    plt.show()

    # Swarm Plot for Price by Number of Bedrooms
    plt.figure(figsize=(12, 6))
    sns.swarmplot(x='bedrooms', y='price', data=combined_items, color='blue', alpha=0.6)
    plt.title('Price Distribution by Number of Bedrooms', fontsize=16)
    plt.xlabel('Number of Bedrooms', fontsize=14)
    plt.ylabel('Price ($)', fontsize=14)
    plt.grid()
    plt.show()

    # Conclusion Section
    print("Conclusion:")
    print("The KNN model identified the following similar properties based on their features.")
    print(f"The original listing ID {id_to_recommend} has a price of {original_item['price'].values[0]} and a square footage of {original_item['sqft_living'].values[0]} sqft.")
    print("The recommended listings show varying distances from the original, indicating how closely they match in terms of features.")
    print("This analysis can help potential buyers find comparable options in the housing market.")


           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  yr_renovated  zipcode      lat  \
0      5650     1.0           0     1  ...             0    98178  47.5112   
1      7242     2.0           0     1  ...          1991    98125  47.7210   
2     10000     1.0           0     1  ...             0    98028  47.7379   
3      5000     1.0           0     1  ...             0    98136  47.5208   
4      8080     1.0           0     1  ...             0    98074  47.6168   

      long  sqft_living15  sqft_lot15  log_sqft_lot  log_basem