In [1]:
import pandas as pd
import numpy as np
import pickle
import sqlite3
import os
from tqdm import tqdm
from urllib.parse import urlparse
import random
import hashlib
import time

In [2]:
# Location of the MS MARCO English CSV export. Set the EN_MSMARCO_DATA_PATH
# environment variable to point at your own copy; the fallback is the original
# author's local path and will not exist on other machines.
EN_MSMARCO_DATA_PATH = os.environ.get(
    "EN_MSMARCO_DATA_PATH",
    "/Users/cgopal/Downloads/web_data/msmarco_english.csv",
)

In [3]:
# Read the header-less CSV, name its four columns, and tag every row with a
# constant language code and an empty topic.
english_data = (
    pd.read_csv(EN_MSMARCO_DATA_PATH, header=None)
    .set_axis(['docid', 'url', 'title', 'body'], axis=1)
    .assign(lang='en', topic='')
)

# Rows without a title are dropped.
has_title = english_data['title'].notna()
english_data = english_data[has_title].reset_index(drop=True)

# The description is the first 300 characters of the body ('' when body is missing).
english_data['description'] = english_data['body'].fillna('').map(lambda text: text[:300])

# Only titles of at most 100 characters are kept.
title_is_short = english_data['title'].map(len) <= 100
english_data = english_data[title_is_short].reset_index(drop=True)

print(len(english_data))
english_data

487041


Unnamed: 0,docid,url,title,body,lang,topic,description
0,D1555982,https://answers.yahoo.com/question/index?qid=2...,The hot glowing surfaces of stars emit energy ...,Science & Mathematics Physics The hot glowing ...,en,,Science & Mathematics Physics The hot glowing ...
1,D301595,http://childparenting.about.com/od/physicalemo...,Developmental Milestones and Your 8-Year-Old C...,School-Age Kids Growth & Development Developme...,en,,School-Age Kids Growth & Development Developme...
2,D1359209,http://visihow.com/Check_for_Lice_Nits,Check for Lice Nits,Check for Lice Nits Edited by Mian Sheilette O...,en,,Check for Lice Nits Edited by Mian Sheilette O...
3,D2147834,http://www.nytimes.com/2010/01/05/business/glo...,Dubai Opens a Tower to Beat All,Global Business Dubai Opens a Tower to Beat Al...,en,,Global Business Dubai Opens a Tower to Beat Al...
4,D1568809,http://www.realtor.com/realestateandhomes-sear...,"Coulterville, CA Real Estate & Homes for Sale","Coulterville, CA Real Estate & Homes for Sale4...",en,,"Coulterville, CA Real Estate & Homes for Sale4..."
...,...,...,...,...,...,...,...
487036,D481294,http://biology.about.com/od/organsystems/ss/ly...,Lymphatic System Components,"Science, Tech, Math ›Science Lymphatic System ...",en,,"Science, Tech, Math ›Science Lymphatic System ..."
487037,D2920233,http://www.freshplaza.com/article/154127/Pomeg...,"Pomegranates: increasing production, consumpti...","Pomegranates: increasing production, consumpti...",en,,"Pomegranates: increasing production, consumpti..."
487038,D1616157,http://www.airpassengerrights.eu/en/denied-boa...,Denied boarding,Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3....,en,,Denied boarding3 1 1 1 1 1 1 1 1 1 1 Rating 3....
487039,D1363414,http://www.augustaga.gov/657/Solid-Waste,Environmental Services,Home Departments A - E Environmental Services ...,en,,Home Departments A - E Environmental Services ...


In [4]:
# (english_data['description'] != '').sum()

In [5]:
def generate_url_hash(url):
    """Map a URL string to a stable non-negative integer below 10**14.

    The URL is UTF-8 encoded and hashed with MD5; the 16-byte digest is read
    as one big-endian integer and reduced modulo 10**14. The same URL always
    yields the same value. Distinct URLs can collide because the digest is
    folded into a much smaller range.

    Args:
        url: The URL as a ``str``.

    Returns:
        An ``int`` in the range ``[0, 10**14)``.
    """
    digest_bytes = hashlib.md5(url.encode("utf-8")).digest()
    return int.from_bytes(digest_bytes, "big") % (10 ** 14)


def generate_last_visit_date(index, base_time=None, step_us=2_000):
    """Return a synthetic visit timestamp in microseconds for row ``index``.

    The result is ``base_time + index * step_us``.

    Args:
        index: Row position; each step moves the timestamp by ``step_us``.
        base_time: Starting timestamp in microseconds since the epoch. When
            ``None`` (the default) the current wall-clock time is read on
            every call, so consecutive calls are not spaced by exactly
            ``step_us``. Pass one fixed value for a whole batch to get evenly
            spaced, reproducible timestamps.
        step_us: Microseconds added per index step (default 2000, i.e. 2 ms).

    Returns:
        The timestamp as an ``int`` when the arguments are ints.
    """
    if base_time is None:
        base_time = int(time.time() * 1_000_000)  # Current time in microseconds
    return base_time + index * step_us

In [6]:
# Fixed seed so the sampled rows and the frecency values are identical on every
# Restart & Run All. Seeding numpy's global generator also covers later cells
# that draw from it without an explicit random_state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_samples = 50000
res_df = english_data.sample(num_samples, random_state=SEED).reset_index(drop=True)

# Synthetic frecency score per row, uniform over the integers 10..500.
frecency = [random.randint(10, 500) for _ in range(num_samples)]
res_df['frecency'] = frecency
res_df['url_hash'] = res_df['url'].apply(generate_url_hash)
# Timestamps still depend on the wall clock at run time, so they are not seeded.
res_df["last_visit_date"] = [generate_last_visit_date(i) for i in range(len(res_df))]
profiles_generated = [res_df]

In [7]:
# with open("../data/profiles_generated.pkl", "rb") as f:
#     profiles_generated = pickle.load(f)

In [8]:
# len(profiles_generated)

In [9]:
# Inspect the first DataFrame in the list of generated profiles.
profiles_generated[0]

Unnamed: 0,docid,url,title,body,lang,topic,description,frecency,url_hash,last_visit_date
0,D2559059,http://www.alzheimersreadingroom.com/2013/09/w...,"When Dementia Patients Say Mean Things, What C...","When Dementia Patients Say Mean Things, What C...",en,,"When Dementia Patients Say Mean Things, What C...",287,6485612302872,1739765225998667
1,D336642,https://www.concordcoalition.org/blog-post/its...,Itâs Important to Distinguish Between Short-Te...,A Series25 Fiscal Lessons Learned over the cou...,en,,A Series25 Fiscal Lessons Learned over the cou...,422,3546477437418,1739765226000669
2,D1764939,"https://en.wikipedia.org/wiki/Flaxton,_North_Y...","Flaxton, North Yorkshire",Wikimedia is looking for female coders! Would ...,en,,Wikimedia is looking for female coders! Would ...,187,90043871859844,1739765226002669
3,D362010,http://www.answers.com/Q/What_is_the_largest_m...,What is the largest mammal on earth?,Answers.com ® Wiki Answers ® Categories Histor...,en,,Answers.com ® Wiki Answers ® Categories Histor...,207,61802397218694,1739765226004669
4,D1470315,https://en.wikipedia.org/wiki/Amazon_Kindle_Fire,Kindle Fire,"From Wikipedia, the free encyclopedia (Redirec...",en,,"From Wikipedia, the free encyclopedia (Redirec...",349,17056276657690,1739765226006669
...,...,...,...,...,...,...,...,...,...,...
49995,D2345587,https://www.amazon.com/Down-River-Mac-McAnally...,Down By The River,CDs & Vinyl › Country › Americana Share Go Unl...,en,,CDs & Vinyl › Country › Americana Share Go Unl...,80,72320718732027,1739765325995515
49996,D889,http://www.answers.com/Q/What_is_the_largest_o...,What is the largest organ of the body?,Answers.com ® Wiki Answers ® Categories Scienc...,en,,Answers.com ® Wiki Answers ® Categories Scienc...,252,58044427806276,1739765325997515
49997,D1050322,https://www.durascreens.com/,The toughest retractable screens on the market!,Your browser does not currently recognize any ...,en,,Your browser does not currently recognize any ...,242,27439745437962,1739765325999515
49998,D1648755,http://www.hrbenefitsalert.com/drop-spouses-he...,Drop spouses from health plan or add a surchar...,Healthcare Drop spouses from health plan or ad...,en,,Healthcare Drop spouses from health plan or ad...,140,29314584118800,1739765326001515


In [10]:
def prepare_synthetic_profile(profiles_generated):
    """Merge generated profile frames into one de-duplicated DataFrame.

    The frames are stacked row-wise, then reduced in three steps: keep the
    first row per ``url``, drop rows whose ``url`` is missing, and keep the
    first row per ``url_hash``. Two derived columns are added:

    * ``domain``: the URL's network location with any ``:port`` suffix removed.
    * ``combined_text``: ``title``, a space, and ``description`` (missing
      descriptions count as an empty string).

    Args:
        profiles_generated: List of DataFrames that each carry at least the
            ``url``, ``url_hash``, ``title`` and ``description`` columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the inputs are not modified.
    """
    stacked = pd.concat(list(profiles_generated), axis=0)

    first_per_url = stacked.drop_duplicates(['url'], keep='first').reset_index(drop=True)
    with_url = first_per_url[first_per_url['url'].notna()].reset_index(drop=True)
    profile_data = with_url.drop_duplicates(['url_hash']).reset_index(drop=True)

    profile_data['domain'] = profile_data['url'].map(
        lambda link: urlparse(link).netloc.split(':')[0]
    )
    descriptions = profile_data['description'].fillna("")
    profile_data['combined_text'] = profile_data['title'] + " " + descriptions
    return profile_data

def get_connection_to_places(db_path="../data/places.sqlite"):
    """Open a SQLite connection to the places database.

    Args:
        db_path: Path of the SQLite file to open. Defaults to
            ``../data/places.sqlite``, which is relative to the working
            directory.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for closing it.

    Note:
        ``sqlite3.connect`` creates an empty database file when the path does
        not exist yet, so a wrong path only shows up later as a missing table.
    """
    return sqlite3.connect(db_path)

def get_personal_history(row_limit, sql_conn):
    """Load the highest-frecency rows of ``moz_places`` into a DataFrame.

    Rows with a NULL title and Google search result URLs are excluded; the
    rest are ordered by ``frecency`` descending and cut off at ``row_limit``.

    Args:
        row_limit: Maximum number of rows to return. Converted with ``int()``
            and bound as a query parameter instead of being pasted into the
            SQL text, so a non-numeric value raises ``ValueError`` rather than
            changing the statement.
        sql_conn: Open connection to a database that contains ``moz_places``.

    Returns:
        A DataFrame with every column of ``moz_places``.
    """
    # The '?' inside the LIKE pattern sits in a string literal, so SQLite does
    # not treat it as a placeholder; only the LIMIT value is bound.
    query = """
        SELECT *
        FROM moz_places p
        WHERE p.title NOTNULL
        AND url not like '%google.com/search?%'
        ORDER BY frecency DESC
        LIMIT ?;
        """
    input_data = pd.read_sql(query, sql_conn, params=(int(row_limit),))
    return input_data


def prepare_dummy_profile(input_data, samples_df):
    """Overwrite the identifying columns of ``input_data`` with synthetic values.

    ``url``, ``title`` and ``url_hash`` are copied from ``samples_df`` row by
    row. ``description`` is copied too, but each value is independently
    replaced by ``None`` with probability 0.75. ``rev_host`` is the character-
    reversed ``domain`` and ``preview_image_url`` is cleared.

    Args:
        input_data: DataFrame to fill. It is modified in place and must have
            the same number of rows as ``samples_df``.
        samples_df: DataFrame providing ``url``, ``title``, ``url_hash``,
            ``description`` and ``domain``.

    Returns:
        The same ``input_data`` object that was passed in.
    """
    def _sometimes_blank(desc):
        # One draw per row: True (probability 0.75) means the description is dropped.
        drop_it = np.random.choice([True, False], p=[0.75, 0.25])
        return None if drop_it else desc

    # Columns taken over unchanged from the synthetic sample.
    for column in ('url', 'title', 'url_hash'):
        input_data[column] = samples_df[column].values.tolist()

    # Fill ~75% of description as null based on some eda
    input_data['description'] = samples_df['description'].apply(_sometimes_blank).values.tolist()

    # NOTE(review): real places databases appear to store rev_host with a
    # trailing '.'; this writes the plain reversed host. Confirm whether
    # downstream consumers care.
    input_data['rev_host'] = [host[::-1] for host in samples_df['domain']]
    input_data['preview_image_url'] = None
    return input_data

In [11]:

def generate_dummy_profile(row_limit):
    """Build a dummy profile of ``row_limit`` rows.

    The top ``row_limit`` history rows are read from the places database and
    their identifying columns are replaced with a random sample of the
    synthetic profile built from the notebook-level ``profiles_generated`` list.

    Args:
        row_limit: Number of rows to read and to sample. Sampling raises if
            the synthetic profile has fewer rows than this.

    Returns:
        The history DataFrame with its identifying columns replaced.
    """
    sql_conn = get_connection_to_places()
    try:
        input_data = get_personal_history(row_limit, sql_conn)
    finally:
        # Close the connection even when the query fails; it was previously leaked.
        sql_conn.close()
    profile_data = prepare_synthetic_profile(profiles_generated)
    samples_df = profile_data.sample(row_limit).reset_index(drop=True)
    return prepare_dummy_profile(input_data, samples_df)

In [12]:
def prepare_sqlite_db(df, places_db_path):
    """Write ``df`` to the ``moz_places`` table of a SQLite file.

    Args:
        df: DataFrame to store; its index is not written.
        places_db_path: Path of the SQLite database file. It is created if it
            does not exist.

    Note:
        ``if_exists="replace"`` drops any existing ``moz_places`` table before
        writing, so the table is recreated from the DataFrame's columns.
    """
    conn = sqlite3.connect(places_db_path)
    try:
        df.to_sql("moz_places", conn, if_exists="replace", index=False)
    finally:
        # Release the file handle even when the write fails.
        conn.close()

In [13]:
# Make sure the output folder exists before any database file is written.
os.makedirs("../data/profiles/dbs", exist_ok=True)

# Profile sizes to generate. Earlier runs also used 10, 3000, 10000, 15000 and 20000.
row_limits = [num_samples]

for row_limit in row_limits:
    profile = generate_dummy_profile(row_limit)
    print(len(profile))
    prepare_sqlite_db(profile, f"../data/profiles/dbs/places_{row_limit}.sqlite")

50000


In [14]:
# profile['description'].isna().sum()

In [15]:
# Export the current profile as a JSON list of records. The file name follows
# the profile's row count (profile_50000.json for a 50,000-row profile), so it
# no longer has to be edited by hand when the sample size changes.
profile.to_json(f"profile_{len(profile)}.json", orient="records", indent=2)

In [16]:
# Display the profile produced by the last iteration of the generation loop.
profile

Unnamed: 0,id,url,title,rev_host,visit_count,hidden,typed,frecency,last_visit_date,guid,foreign_count,url_hash,description,preview_image_url,site_name,origin_id,recalc_frecency,alt_frecency,recalc_alt_frecency
0,34,https://www.pharmacy.texas.gov/consumer/broch4...,Prescriptions,vog.saxet.ycamrahp.www,5077,0,1,1421560,1736196807185413,64gFsanEGd8x,0,98401416987290,,,,10,0,,1
1,27217,https://www.quora.com/What-should-I-pay-per-ho...,What should I pay per hour for an intern who i...,moc.arouq.www,313,0,1,281700,1736189753947959,vZxpojuFe932,0,77994502836526,,,,1813,0,,1
2,41486,http://www.answers.com/Q/What_is_the_meaning_o...,What is the meaning of limnic eruptions?,moc.srewsna.www,379,0,1,274017,1736176190746123,kYNCG6zGvvrs,0,50240844195400,,,,16,0,,1
3,89772,http://www.wikihow.com/Correct-Forward-Head-Po...,How to Correct Forward Head Posture,moc.wohikiw.www,199,0,1,248551,1736176755662276,U8IfBJ2WRfou,0,33318718073047,,,,4429,0,,1
4,1400,http://www.del-mar-guide.com/,Welcome to Del Mar!,moc.ediug-ram-led.www,452,0,1,236911,1735422684696590,Y2kdcw-k0cz7,0,8269018918039,,,,73,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,42594,http://www.wareable.com/vr/htc-vive-vr-headset...,HTC Vive review,moc.elbaeraw.www,1,0,0,20,1694285713984166,oGsWGRj3sqiW,0,36555627034839,HTC Vive review HTC’s headset shows us just wh...,,,2860,0,,1
49996,42593,https://answers.yahoo.com/question/index?qid=2...,Which color is positive and which is negative ...,moc.oohay.srewsna,1,0,0,20,1694285700940371,zwTExqjxBIJo,0,34940048325300,Cars & Transportation Maintenance & Repairs Wh...,,,2860,0,,1
49997,42592,http://www.pickles-and-spices.com/types-of-vin...,Types of Vinegar - which are good for cooking ...,moc.secips-dna-selkcip.www,1,0,0,20,1694285691931865,nBdl7kNIpnJU,0,38649068158336,Follow @Nazlina Types of Vinegar - which are g...,,,2860,0,,1
49998,42591,http://www.cracked.com/article_19303_6-reasons...,6 Reasons Professional Poker Is Way Harder Tha...,moc.dekcarc.www,1,0,0,20,1694285676220461,ujftTTMJnB9o,0,19450394137032,,,,2860,0,,1


In [17]:
# profile.drop_duplicates(['url_hash']).reset_index(drop=True)

In [18]:
# Number of distinct URLs; it equals the row count when no URL repeats.
profile['url'].nunique()

50000

In [19]:
# Count of rows whose URL is missing.
profile['url'].isna().sum()

np.int64(0)

In [20]:
# Number of distinct url_hash values; fewer than the row count would mean repeated hashes.
profile['url_hash'].nunique()

50000

In [21]:
# Occurrences of each url_hash value, most frequent first; any count above 1 is a repeat.
profile['url_hash'].value_counts()

url_hash
98401416987290    1
9089697846325     1
9210841145943     1
6019927410306     1
12907093726445    1
                 ..
43864775691024    1
5200323214879     1
37548005763151    1
79593519871766    1
19524985471870    1
Name: count, Length: 50000, dtype: int64

In [22]:
# Mean character length of the descriptions; missing descriptions are skipped by mean().
profile['description'].str.len().mean()

np.float64(295.9732607477384)

In [23]:
# Summary statistics of title length in characters, with extra upper-tail percentiles.
profile['title'].str.len().describe(percentiles=[.25, .5, .75, .9, .95, .98, .99, .995, .998, .999])

count    50000.000000
mean        30.754360
std         21.122991
min          1.000000
25%         14.000000
50%         28.000000
75%         44.000000
90%         60.000000
95%         70.000000
98%         82.000000
99%         89.000000
99.5%       94.000000
99.8%       97.000000
99.9%       99.000000
max        100.000000
Name: title, dtype: float64