# Dating App EDA

## Takeaways
<ul>
    <li>Data is clean. No duplicates</li>
</ul>

In [63]:
import pandas as pd
import numpy as np
import random as rd
import seaborn as sn
from datetime import datetime, date

In [32]:
# Import Data
users = pd.read_csv("Data/users.csv")
messages = pd.read_csv("Data/messages.csv")
reports = pd.read_csv("Data/reports.csv")

## Quality Check

In [33]:
# Look at data

print(f"Users:\n{users.head()}\n---\nMessages:\n{messages.head()}\n---\nReports:\n{reports.head()}")

Users:
   user_id  start_date  is_verified     gender   region device_os  \
0     1001  2023-01-31            0          F   Canada       iOS   
1     1002  2023-12-30            0  Nonbinary  Nigeria   Android   
2     1003  2022-05-10            0          M       US   Android   
3     1004  2023-07-18            0          M   Canada       iOS   
4     1005  2023-02-04            1          M       US   Android   

  platform_package  
0             base  
1             base  
2             base  
3             base  
4             gold  
---
Messages:
   message_id  sender_id  receiver_id            timestamp  \
0       50001       1283         1019  2025-01-02 07:17:00   
1       50002       1136         1471  2025-01-18 13:58:00   
2       50003       1467         1311  2025-01-05 21:33:00   
3       50004       1380         1358  2025-01-27 07:29:00   
4       50005       1362         1374  2025-01-30 08:44:00   

                                message_text  
0               Ca

### Users

In [7]:
users.isna().sum()

user_id             0
account_age_days    0
is_verified         0
gender              0
region              0
device_os           0
dtype: int64

In [10]:
users.describe()

Unnamed: 0,user_id,account_age_days,is_verified
count,500.0,500.0,500.0
mean,1250.5,1031.674,0.288
std,144.481833,570.153346,0.453285
min,1001.0,1.0,0.0
25%,1125.75,554.5,0.0
50%,1250.5,1059.5,0.0
75%,1375.25,1509.25,1.0
max,1500.0,1998.0,1.0


In [14]:
users.user_id.nunique()

500

### Messages

In [8]:
messages.isna().sum()

message_id      0
sender_id       0
receiver_id     0
timestamp       0
message_text    0
dtype: int64

In [11]:
messages.describe()

Unnamed: 0,message_id,sender_id,receiver_id
count,1000.0,1000.0,1000.0
mean,50500.5,1255.859,1243.488
std,288.819436,144.034409,146.413978
min,50001.0,1001.0,1001.0
25%,50250.75,1128.0,1113.0
50%,50500.5,1262.0,1237.0
75%,50750.25,1383.0,1371.25
max,51000.0,1500.0,1500.0


In [16]:
print(messages.sender_id.nunique(), messages.receiver_id.nunique(), messages.message_id.nunique())

429 434 1000


### Reports

In [9]:
reports.isna().sum()

report_id           0
reporter_id         0
reported_user_id    0
category            0
timestamp           0
dtype: int64

In [12]:
reports.describe

<bound method NDFrame.describe of     report_id  reporter_id  reported_user_id        category  \
0        9001         1403              1202      catfishing   
1        9002         1404              1212  financial_scam   
2        9003         1100              1006            spam   
3        9004         1163              1112      harassment   
4        9005         1416              1376            spam   
..        ...          ...               ...             ...   
95       9096         1424              1121      harassment   
96       9097         1378              1298      catfishing   
97       9098         1096              1466      catfishing   
98       9099         1457              1054      harassment   
99       9100         1229              1331  financial_scam   

              timestamp  
0   2025-01-05 20:24:00  
1   2025-01-15 01:28:00  
2   2025-01-01 19:07:00  
3   2025-01-03 18:41:00  
4   2025-01-25 19:40:00  
..                  ...  
95  2025-01-17 

In [75]:
reports['timestamp_dt'] = pd.to_datetime(reports['timestamp'])

## Join Data

In [35]:
base = pd.merge(users, reports, left_on='user_id', right_on='reported_user_id', how='left')
base.head()

Unnamed: 0,user_id,start_date,is_verified,gender,region,device_os,platform_package,report_id,reporter_id,reported_user_id,category,timestamp
0,1001,2023-01-31,0,F,Canada,iOS,base,,,,,
1,1002,2023-12-30,0,Nonbinary,Nigeria,Android,base,,,,,
2,1003,2022-05-10,0,M,US,Android,base,,,,,
3,1004,2023-07-18,0,M,Canada,iOS,base,,,,,
4,1005,2023-02-04,1,M,US,Android,gold,,,,,


In [77]:
base['age_on_report'] = pd.to_datetime(base[base.report_id.notna()]['timestamp']) - pd.to_datetime(base[base.report_id.notna()]['start_date'])  

In [69]:
base['age_on_report'] = datetime.strptime(base[base.timestamp.notna()].timestamp, '%Y-%m-%d').date() - datetime.strptime(base[base.timestamp.notna()].start_date[7], '%Y-%m-%d').date()
base.head()

TypeError: strptime() argument 1 must be str, not Series

In [37]:
base[base.report_id.notna()].head()

Unnamed: 0,user_id,start_date,is_verified,gender,region,device_os,platform_package,report_id,reporter_id,reported_user_id,category,timestamp
7,1008,2022-11-10,0,Nonbinary,India,iOS,base,9095.0,1240.0,1008.0,catfishing,2025-02-15
12,1013,2020-11-26,1,F,Canada,iOS,silver,9021.0,1072.0,1013.0,spam,2024-07-20
23,1024,2020-12-09,1,F,Canada,Android,silver,9078.0,1397.0,1024.0,harassment,2025-01-16
27,1028,2021-01-20,1,M,US,iOS,gold,9065.0,1212.0,1028.0,financial_scam,2024-08-19
28,1029,2023-04-30,0,M,Philippines,Android,base,9052.0,1326.0,1029.0,financial_scam,2024-07-03


In [36]:
base.describe()

Unnamed: 0,user_id,is_verified,report_id,reporter_id,reported_user_id
count,510.0,510.0,100.0,100.0,100.0
mean,1250.32549,0.294118,9050.5,1246.35,1266.85
std,144.383779,0.456092,29.011492,143.166949,146.266007
min,1001.0,0.0,9001.0,1008.0,1008.0
25%,1126.25,0.0,9025.75,1114.25,1145.5
50%,1249.5,0.0,9050.5,1218.5,1257.5
75%,1374.75,1.0,9075.25,1369.5,1394.75
max,1500.0,1.0,9100.0,1500.0,1499.0


In [34]:
base.groupby(base[base.report_id.notna()]['gender'])['account_age_days'].agg(['mean', 'median', 'std', 'sum'])

Unnamed: 0_level_0,mean,median,std,sum
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,1060.061224,1060.0,577.117168,51943
M,1121.452381,1168.5,559.778057,47101
Nonbinary,818.333333,751.0,693.354166,7365


In [31]:
base.groupby(base['gender'])['account_age_days'].agg(['min', 'max', 'mean', 'median', 'std', 'sum'])

Unnamed: 0_level_0,min,max,mean,median,std,sum
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,1,1981,1046.308943,1104.0,570.523671,257392
M,1,1998,1034.014354,1038.0,573.580702,216109
Nonbinary,16,1981,966.557692,954.5,558.867006,50261
