In [3]:
import sqlite3
import pandas as pd

In [4]:
# read csv table
inpath = '../../../pappa/data/gender_classification/user_age_gender.csv'
df = pd.read_csv(inpath, sep=';', index_col=0)
# count number of rows and of unique user_ids
print('Number of rows:', df.shape[0])
print('Number of unique user_ids:', len(df.user_id.unique()))
# retrieve duplicate user_ids
duplicate_user_ids = df[df.duplicated(subset=['user_id'], keep=False)].user_id.unique()
print('Number of duplicate user_ids:', len(duplicate_user_ids))
duplicate_user_ids
# retrieve empty user_ids
empty_user_ids = df[df.user_id.isna()].index
print('Number of empty user_ids:', len(empty_user_ids))

Number of rows: 15773
Number of unique user_ids: 15764
Number of duplicate user_ids: 2
Number of empty user_ids: 2


In [134]:
def retrieve_users_tweets(
        cursor:sqlite3.Cursor,
        table:str,
        user_id:list,
        max_tweets:int=100,
        tweets_features:list=['tweet_id', 'created_at', 'text', 'retweet_text',],
        ):
    
    query = f"""
        SELECT *
        FROM {table}
        WHERE user_id == {user_id}
        LIMIT {max_tweets};
        """
    
    # Execute the query and fetch the results
    cursor.execute(query)
    tweets = cursor.fetchall()

    return tweets

# Example usage
database_path = '../mydata/database/myMENTALISM.db'
table_name = 'sample_tweets'
user_ids_table = '../../../pappa/data/user_classification/user_age_gender_location.pkl'
user_ids_to_retrieve = pd.read_pickle(user_ids_table).user_id[:100].astype(int).tolist()
#user_ids_to_retrieve = ['842452578738806784', '69150122']
max_tweets_to_retrieve = 100000
tweets_features_to_retrieve = ['tweet_id', 'created_at', 'text', 'retweet_text']

# Connect to the database
conn = sqlite3.connect(database_path)
cursor = conn.cursor()

result = {}
# loop through batches of user_ids
for u in user_ids_to_retrieve:
    # retrieve tweets
    tweets = retrieve_users_tweets(cursor, table_name, u, max_tweets_to_retrieve, tweets_features_to_retrieve)
    # save the tweets in a dictionary
    result[u] = tweets

# Close the database connection
conn.close()

In [135]:
# count all tweets retrieved
total_tweets = 0
for k in result.keys():
    total_tweets += len(result[k])
print('Total tweets retrieved:', total_tweets)

Total tweets retrieved: 888


The above solution is too slow, let's try to optimize it...

In [168]:
from tqdm import tqdm

TABLE_COLUMN_NAMES = [
    "tweet_id",
    "user_id",
    "created_at",
    "text",
    "retweet_text",
]

# Example usage
user_ids_table = '../../../pappa/data/user_classification/user_age_gender_location.pkl'
db_file = '../mydata/database/myMENTALISM.db'
table_name = 'sample_tweets'
user_ids_to_retrieve = pd.read_pickle(user_ids_table).user_id.astype(int).tolist()
#user_ids_to_retrieve = [int('842452578738806784'), int('69150122')]
max_tweets_to_retrieve = 100
chunk_size = 1000
tweets_features_to_retrieve = ['tweet_id', 'created_at', 'text', 'retweet_text']
remove_columns = None
max_tweets_in_file = 1000
n_files=0

# Create a database connection
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Get the total number of rows
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
total_rows = cursor.fetchone()[0]

# Initialize an empty DataFrame
result_df = pd.DataFrame()

# Initialize a tqdm progress bar
progress_bar = tqdm(total=total_rows, unit="row", desc="Processing")

column_names = TABLE_COLUMN_NAMES

# Loop through the data in chunks
for offset in range(0, total_rows, chunk_size):
    # Query the database for a chunk of rows
    cursor.execute(f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}")
    rows = cursor.fetchall()

    # Create a DataFrame from the fetched rows
    chunk_df = pd.DataFrame(rows, columns=column_names)
    
    # Remove the unwanted columns
    if remove_columns is not None:
        chunk_df = chunk_df.drop(columns=remove_columns)

    # Append to the result DataFrame
    # chunk_df['user_id'] = chunk_df['user_id'].astype(str)
    result_df = pd.concat([result_df, chunk_df[chunk_df['user_id'].isin(user_ids_to_retrieve)]], ignore_index=True)

    if len(result_df) >= max_tweets_in_file:
        # Save results to json
        print(f'Saving {len(result_df)} tweets to file {n_files}')
        result_df.to_pickle(f'bo{n_files}.pkl')
        # Reset the result DataFrame
        result_df = pd.DataFrame()
        n_files += 1

    # Update the progress bar
    progress_bar.update(len(rows))

# Close the tqdm progress bar
progress_bar.close()

# Close the database connection
conn.close()

Processing:   1%|          | 37000/4637193 [00:02<03:02, 25155.34row/s]

Saving file 0 with 1996 tweets
Saving file 1 with 1338 tweets


Processing:   1%|▏         | 65000/4637193 [00:02<01:33, 49155.11row/s]

Saving file 2 with 1315 tweets


Processing:   2%|▏         | 91000/4637193 [00:02<01:04, 70403.55row/s]

Saving file 3 with 1367 tweets


Processing:   4%|▎         | 165000/4637193 [00:03<00:56, 78615.96row/s]

Saving file 4 with 1870 tweets


Processing:   4%|▍         | 190000/4637193 [00:04<01:03, 69576.86row/s]

Saving file 5 with 1091 tweets


Processing:   5%|▌         | 239000/4637193 [00:04<01:16, 57697.48row/s]

Saving file 6 with 1916 tweets


Processing:   6%|▋         | 293000/4637193 [00:05<01:27, 49931.58row/s]

Saving file 7 with 1689 tweets
Saving file 8 with 1000 tweets


Processing:   7%|▋         | 319000/4637193 [00:06<01:34, 45523.58row/s]

Saving file 9 with 1915 tweets


Processing:   8%|▊         | 354000/4637193 [00:07<01:43, 41421.09row/s]

Saving file 10 with 1390 tweets


Processing:   8%|▊         | 378000/4637193 [00:07<01:48, 39139.66row/s]

Saving file 11 with 1810 tweets


Processing:   9%|▉         | 414000/4637193 [00:08<01:55, 36497.07row/s]

Saving file 12 with 1322 tweets
Saving file 13 with 1000 tweets


Processing:  10%|▉         | 450000/4637193 [00:10<02:03, 34005.21row/s]

KeyboardInterrupt: 

Processing:  10%|▉         | 450000/4637193 [00:27<02:03, 34005.21row/s]

In [166]:
len(result_df)

0

In [169]:
df = pd.read_pickle('bo0.pkl')
df

FileNotFoundError: [Errno 2] No such file or directory: 'bo0.pkl'

In [152]:
227486 / 4637193 * 600000000

29434099.464913364

In [156]:
600000000 / 4637193 / 10 * 82

1060.9866787946933

In [157]:
227486 * 10

2274860