In [71]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [72]:
df = pd.read_csv("okcupid_profiles.csv")

In [73]:
df.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,


Formating and Cleaning Data

In [74]:
df.isna().sum()

age                0
status             0
sex                0
orientation        0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
pets           19921
religion       20226
sign           11056
smokes          5512
speaks            50
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64

In [75]:
# Renaming columns
columns = ["body_type", "diet", "education", "drinks", "drugs","ethnicity", "job","offspring","pets","religion", "sign", "smokes", "speaks"]

# Filling NaN values
df[columns] = df[columns].fillna("No info")
df["height"] = df["height"].fillna(0)
df["income"] = df["income"].replace(-1, 0)

# Creating a new column by combining essays into one profile text
df["profile_text"] = (df["essay0"].str.cat(df.iloc[:,22:-1].astype(str), sep=" "))

# Deleting unnecessary essays columns, as now I have all of them together in the profile_text column
df = df.drop(df.iloc[:,21:-1], axis=1)
df["profile_text"]=df["profile_text"].fillna("No info")

# Creating a new column by counting the words in the profile_text column
df["total_words"] = df["profile_text"].str.split().str.len()

# Converting the height column from inches to cm
df['height'] = (df['height'] * 2.54).astype(int)

df = df.drop_duplicates()

df.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,last_online,location,offspring,pets,religion,sign,smokes,speaks,profile_text,total_words
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,about me: i would love to think that i was so...,420
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",i am a chef: this is what that means. 1. i am ...,271


In [76]:
geolocator = Nominatim(user_agent="geoapiExercises")

# create a dictionary to store the latitude and longitude of each city
city_coordinates = {}
for city in unique_cities:
    location = geolocator.geocode(city)
    if location is not None:
        city_coordinates[city] = (location.latitude, location.longitude)
    else:
        pass

# create two new columns in the DataFrame based on the latitude and longitude of each city
df["latitude"] = df["location"].map(lambda x: city_coordinates.get(x, [None, None])[0])
df["longitude"] = df["location"].map(lambda x: city_coordinates.get(x, [None, None])[1])

df = df.dropna()

df.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,offspring,pets,religion,sign,smokes,speaks,profile_text,total_words,latitude,longitude
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,about me: i would love to think that i was so...,420,37.65354,-122.416866
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",i am a chef: this is what that means. 1. i am ...,271,37.804456,-122.271356


In [77]:
# Checking again
df.isna().sum()

age             0
status          0
sex             0
orientation     0
body_type       0
diet            0
drinks          0
drugs           0
education       0
ethnicity       0
height          0
income          0
job             0
last_online     0
location        0
offspring       0
pets            0
religion        0
sign            0
smokes          0
speaks          0
profile_text    0
total_words     0
latitude        0
longitude       0
dtype: int64

In [78]:
df.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,offspring,pets,religion,sign,smokes,speaks,profile_text,total_words,latitude,longitude
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,about me: i would love to think that i was so...,420,37.65354,-122.416866
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",i am a chef: this is what that means. 1. i am ...,271,37.804456,-122.271356


General Statistics and Data Types

In [79]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,59859.0,32.326684,9.436611,18.0,26.0,30.0,37.0,110.0
height,59859.0,172.958619,10.147321,0.0,167.0,172.0,180.0,241.0
income,59859.0,20021.884763,97320.74117,0.0,0.0,0.0,0.0,1000000.0
total_words,59859.0,323.60564,275.010325,2.0,138.0,274.0,445.0,9511.0
latitude,59859.0,37.772212,0.329344,12.243169,37.779026,37.779026,37.804456,55.953346
longitude,59859.0,-122.276241,2.203446,-157.855676,-122.419906,-122.419906,-122.271356,109.189868


In [80]:
df.describe(include = "object").T

Unnamed: 0,count,unique,top,freq
status,59859,5,single,55613
sex,59859,2,m,35791
orientation,59859,3,straight,51527
body_type,59859,13,average,14626
diet,59859,19,No info,24362
drinks,59859,7,socially,41716
drugs,59859,4,never,37660
education,59859,33,graduated from college/university,23919
ethnicity,59859,218,white,32767
job,59859,22,No info,8193


In [81]:
# On average, the age of profile's is 32 years old, with the income around 20.000$
# Average number of words in each biography is 323

In [82]:
df.to_csv('cleaned.csv', index=False)