In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

# Read the cleaned dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="cleaned_data_v1.csv")

# Preview the leading rows (head() shows five by default)
df.head(n=5)


Unnamed: 0,User_ID,Age,Gender,Location,Education,Occupation,Primary_App,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Reason_for_Using,Satisfaction,Challenges,Desired_Features,Preferred_Communication,Partner_Priorities
0,1,20,non-binary,Bangalore,Undergraduate,Freelancer,Hinge,Hinge,Monthly,1 hour,Finding a Partner,4,Safety Concerns,Audio Calls,Video Calls,Values > Personality > Appearance
1,2,24,female,Delhi,Undergraduate,Part-time Job,Hinge,OkCupid,Weekly,30 minutes,Casual Dating,5,Time-Wasting,Video Calls,Text,Values > Personality > Appearance
2,3,24,non-binary,Kolkata,Undergraduate,Intern,Unknown,Unknown,Weekly,2 hours,Casual Dating,4,Safety Concerns,Detailed Profiles,Text,Values > Personality > Appearance
3,4,22,non-binary,Delhi,Graduate,Full-time Job,Unknown,OkCupid,Daily,30 minutes,Casual Fun,3,Unknown,AI Recommendations,Voice Notes,Personality > Interests > Values
4,5,18,male,Delhi,Graduate,Intern,OkCupid,OkCupid,Weekly,2 hours,Casual Fun,4,Safety Concerns,Video Calls,Text,Appearance > Interests > Personality


In [4]:
# One-hot encode the nominal columns. drop_first=True removes the first level
# of each column, so k categories become k-1 indicator columns.
categorical_cols = [
    "Gender",
    "Primary_App",
    "Location",
    "Reason_for_Using",
]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Preview the encoded frame
df.head()


Unnamed: 0,User_ID,Age,Education,Occupation,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Satisfaction,Challenges,Desired_Features,...,Primary_App_Unknown,Location_Chennai,Location_Delhi,Location_Hyderabad,Location_Kolkata,Location_Mumbai,Location_Pune,Reason_for_Using_Casual Fun,Reason_for_Using_Finding a Partner,Reason_for_Using_Social Interaction
0,1,20,Undergraduate,Freelancer,Hinge,Monthly,1 hour,4,Safety Concerns,Audio Calls,...,False,False,False,False,False,False,False,False,True,False
1,2,24,Undergraduate,Part-time Job,OkCupid,Weekly,30 minutes,5,Time-Wasting,Video Calls,...,False,False,True,False,False,False,False,False,False,False
2,3,24,Undergraduate,Intern,Unknown,Weekly,2 hours,4,Safety Concerns,Detailed Profiles,...,True,False,False,False,True,False,False,False,False,False
3,4,22,Graduate,Full-time Job,OkCupid,Daily,30 minutes,3,Unknown,AI Recommendations,...,True,False,True,False,False,False,False,True,False,False
4,5,18,Graduate,Intern,OkCupid,Weekly,2 hours,4,Safety Concerns,Video Calls,...,False,False,True,False,False,False,False,True,False,False


In [6]:
import re

# Convert a duration label to whole minutes
# (e.g., '1 hour' → 60, '30 minutes' → 30, '1.5 hours' → 90, '1 hour 30 minutes' → 90)
def convert_to_minutes(value):
    """Convert a textual duration such as ``"2 hours"`` into whole minutes.

    Parameters
    ----------
    value : str or any
        A duration label containing the word "hour" and/or "min"
        (matched case-insensitively), or an already-numeric value.

    Returns
    -------
    int
        Minutes, rounded to the nearest whole minute, when ``value`` is a
        string that mentions "hour" or "min". A string that names a unit but
        contains no number yields 0.
    same as input
        ``value`` itself, unchanged, when it is not a string or when it
        mentions neither unit (so the caller decides how to handle it).

    Notes
    -----
    * Decimal amounts are honoured: ``"1.5 hours"`` → 90.
    * When both units appear, every ``<number> <unit>`` pair is summed:
      ``"1 hour 30 minutes"`` → 90.
    * With a single unit, the first number in the text is used, and "hour"
      takes precedence over "min" when pairs cannot be identified.
    """
    if not isinstance(value, str):
        return value  # Already numeric (or missing): pass through untouched

    text = value.lower()  # Accept 'Hours' / 'MIN' as well as lowercase labels
    has_hours = "hour" in text
    has_minutes = "min" in text
    if not (has_hours or has_minutes):
        return value  # No recognised unit: leave the value for the caller

    if has_hours and has_minutes:
        # Mixed label: add up each "<number> <unit>" component.
        pairs = re.findall(r'(\d+(?:\.\d+)?)\s*(hour|min)', text)
        if pairs:
            total = sum(
                float(amount) * (60 if unit == "hour" else 1)
                for amount, unit in pairs
            )
            return int(round(total))

    # Single unit (or no adjacent number/unit pair): use the first number.
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    if not numbers:
        return 0
    factor = 60 if has_hours else 1  # Hours are converted to minutes
    return int(round(float(numbers[0]) * factor))

# Express Daily_Usage_Time in minutes: pass every entry through
# convert_to_minutes, then cast the resulting column to an integer dtype.
df["Daily_Usage_Time"] = (
    df["Daily_Usage_Time"]
    .apply(convert_to_minutes)
    .astype(int)
)


In [7]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric features with a min-max scaler (default settings),
# fitting and transforming in a single pass over the same two columns.
scaler = MinMaxScaler()
scaled_columns = ["Age", "Daily_Usage_Time"]
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Preview the scaled frame
df.head()


Unnamed: 0,User_ID,Age,Education,Occupation,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Satisfaction,Challenges,Desired_Features,...,Primary_App_Unknown,Location_Chennai,Location_Delhi,Location_Hyderabad,Location_Kolkata,Location_Mumbai,Location_Pune,Reason_for_Using_Casual Fun,Reason_for_Using_Finding a Partner,Reason_for_Using_Social Interaction
0,1,0.285714,Undergraduate,Freelancer,Hinge,Monthly,0.2,4,Safety Concerns,Audio Calls,...,False,False,False,False,False,False,False,False,True,False
1,2,0.857143,Undergraduate,Part-time Job,OkCupid,Weekly,0.0,5,Time-Wasting,Video Calls,...,False,False,True,False,False,False,False,False,False,False
2,3,0.857143,Undergraduate,Intern,Unknown,Weekly,0.6,4,Safety Concerns,Detailed Profiles,...,True,False,False,False,True,False,False,False,False,False
3,4,0.571429,Graduate,Full-time Job,OkCupid,Daily,0.0,3,Unknown,AI Recommendations,...,True,False,True,False,False,False,False,True,False,False
4,5,0.0,Graduate,Intern,OkCupid,Weekly,0.6,4,Safety Concerns,Video Calls,...,False,False,True,False,False,False,False,True,False,False


In [8]:
# Count the comma-separated entries in Secondary_Apps for each respondent.
# Missing (NaN / non-string) values count as 1 instead of raising
# AttributeError on .split: pd.read_csv parses a literal "None" cell as NaN by
# default, so comparing against the string "None" cannot guard against it.
# A plain "None" string contains no comma and therefore still yields 1.
# NOTE(review): Primary_App is not included and "Unknown" counts as one app —
# confirm this matches the intended meaning of "active app count".
df["Active_App_Count"] = df["Secondary_Apps"].apply(
    lambda x: len(x.split(",")) if isinstance(x, str) else 1
)

# Display updated dataset
df.head()


Unnamed: 0,User_ID,Age,Education,Occupation,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Satisfaction,Challenges,Desired_Features,...,Location_Chennai,Location_Delhi,Location_Hyderabad,Location_Kolkata,Location_Mumbai,Location_Pune,Reason_for_Using_Casual Fun,Reason_for_Using_Finding a Partner,Reason_for_Using_Social Interaction,Active_App_Count
0,1,0.285714,Undergraduate,Freelancer,Hinge,Monthly,0.2,4,Safety Concerns,Audio Calls,...,False,False,False,False,False,False,False,True,False,1
1,2,0.857143,Undergraduate,Part-time Job,OkCupid,Weekly,0.0,5,Time-Wasting,Video Calls,...,False,True,False,False,False,False,False,False,False,1
2,3,0.857143,Undergraduate,Intern,Unknown,Weekly,0.6,4,Safety Concerns,Detailed Profiles,...,False,False,False,True,False,False,False,False,False,1
3,4,0.571429,Graduate,Full-time Job,OkCupid,Daily,0.0,3,Unknown,AI Recommendations,...,False,True,False,False,False,False,True,False,False,1
4,5,0.0,Graduate,Intern,OkCupid,Weekly,0.6,4,Safety Concerns,Video Calls,...,False,True,False,False,False,False,True,False,False,1


In [9]:
# Persist the engineered features; index=False keeps the row index out of the file
df.to_csv(path_or_buf="feature_engineered_data.csv", index=False)
