In [1]:
import pandas as pd

# Read the cleaned survey export into a DataFrame
df = pd.read_csv(filepath_or_buffer="cleaned_data_v1.csv")

# Preview the top of the table (head() defaults to five rows)
df.head(n=5)


Unnamed: 0,User_ID,Age,Gender,Location,Education,Occupation,Primary_App,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Reason_for_Using,Satisfaction,Challenges,Desired_Features,Preferred_Communication,Partner_Priorities
0,1,20,non-binary,Bangalore,Undergraduate,Freelancer,Hinge,Hinge,Monthly,1 hour,Finding a Partner,4,Safety Concerns,Audio Calls,Video Calls,Values > Personality > Appearance
1,2,24,female,Delhi,Undergraduate,Part-time Job,Hinge,OkCupid,Weekly,30 minutes,Casual Dating,5,Time-Wasting,Video Calls,Text,Values > Personality > Appearance
2,3,24,non-binary,Kolkata,Undergraduate,Intern,Unknown,Unknown,Weekly,2 hours,Casual Dating,4,Safety Concerns,Detailed Profiles,Text,Values > Personality > Appearance
3,4,22,non-binary,Delhi,Graduate,Full-time Job,Unknown,OkCupid,Daily,30 minutes,Casual Fun,3,Unknown,AI Recommendations,Voice Notes,Personality > Interests > Values
4,5,18,male,Delhi,Graduate,Intern,OkCupid,OkCupid,Weekly,2 hours,Casual Fun,4,Safety Concerns,Video Calls,Text,Appearance > Interests > Personality


In [2]:
# Print the schema summary: column names, non-null counts and dtypes
df.info()

# Count missing entries per column (isna is the canonical name for isnull)
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   User_ID                  500 non-null    int64 
 1   Age                      500 non-null    int64 
 2   Gender                   500 non-null    object
 3   Location                 500 non-null    object
 4   Education                500 non-null    object
 5   Occupation               500 non-null    object
 6   Primary_App              500 non-null    object
 7   Secondary_Apps           500 non-null    object
 8   Usage_Frequency          500 non-null    object
 9   Daily_Usage_Time         500 non-null    object
 10  Reason_for_Using         500 non-null    object
 11  Satisfaction             500 non-null    int64 
 12  Challenges               500 non-null    object
 13  Desired_Features         500 non-null    object
 14  Preferred_Communication  500 non-null    object
 15  Partner_Priorities       500 non-null    object
dtypes: int64(3), object(13)

User_ID                    0
Age                        0
Gender                     0
Location                   0
Education                  0
Occupation                 0
Primary_App                0
Secondary_Apps             0
Usage_Frequency            0
Daily_Usage_Time           0
Reason_for_Using           0
Satisfaction               0
Challenges                 0
Desired_Features           0
Preferred_Communication    0
Partner_Priorities         0
dtype: int64

In [3]:
# Plain-language meaning of each dataset column, keyed by column name.
# Insertion order is preserved and becomes the row order of the table below.
data_dict = dict(
    User_ID="Unique identifier for each user",
    Age="User's age in years",
    Gender="User's gender identity",
    Location="City or region where the user is based",
    Education="User's highest level of education",
    Occupation="Current employment status",
    Primary_App="Main dating app used",
    Secondary_Apps="Other dating apps the user uses",
    Usage_Frequency="How often the user engages with dating apps",
    Daily_Usage_Time="Time spent daily on dating apps",
    Reason_for_Using="User's reason for using dating apps",
    Satisfaction="User's satisfaction level (scale of 1-5)",
    Challenges="Issues faced while using dating apps",
    Desired_Features="Features users want in dating apps",
    Preferred_Communication="Preferred mode of communication",
    Partner_Priorities="Key traits users look for in a partner",
)

# Lay the mapping out as a two-column table: one row per documented column
data_dict_df = pd.DataFrame(
    {
        "Column Name": list(data_dict.keys()),
        "Description": list(data_dict.values()),
    }
)

# Persist the table next to the notebook, without the positional index
data_dict_df.to_csv("data_dictionary.csv", index=False)

# Render the table as the cell output
data_dict_df


Unnamed: 0,Column Name,Description
0,User_ID,Unique identifier for each user
1,Age,User's age in years
2,Gender,User's gender identity
3,Location,City or region where the user is based
4,Education,User's highest level of education
5,Occupation,Current employment status
6,Primary_App,Main dating app used
7,Secondary_Apps,Other dating apps the user uses
8,Usage_Frequency,How often the user engages with dating apps
9,Daily_Usage_Time,Time spent daily on dating apps


In [5]:
# Build README.md for the dataset and write it next to the notebook.
#
# The static part of the document ends with a Markdown table header; the table
# body is generated from `data_dict_df` (columns "Column Name" / "Description").
readme_header = """
# GenZ Dating App Dataset - Documentation

## 📌 Dataset Overview
This dataset contains information about GenZ users and their behavior on dating apps.

## 📂 Files Included
- `cleaned_data_v1.csv` - The cleaned dataset
- `data_dictionary.csv` - Column descriptions
- `dataset_documentation.ipynb` - This notebook

## 📊 Columns & Descriptions
| Column Name | Description |
|-------------|-------------|
"""

# Emit one pipe-delimited row per dictionary entry so the rows belong to the
# table header above. DataFrame.to_string() would instead produce a fixed-width
# text dump (with its own header line and no `|` separators), which Markdown
# renders as a paragraph rather than as table rows.
# A literal `|` inside a cell is escaped so it cannot split the cell in two.
table_rows = "\n".join(
    "| {} | {} |".format(
        str(column_name).replace("|", "\\|"),
        str(description).replace("|", "\\|"),
    )
    for column_name, description in zip(
        data_dict_df["Column Name"], data_dict_df["Description"]
    )
)

# Finish with a newline so the file ends on a complete line
readme_content = readme_header + table_rows + "\n"

# Save README file with UTF-8 encoding (the headings contain emoji)
with open("README.md", "w", encoding="utf-8") as file:
    file.write(readme_content)

print("README.md file created successfully!")



README.md file created successfully!
