In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

# Merge log files together 

The CSV files contain query results for individual grids within the selected period. To facilitate further exploration, we merge these files together.

In [3]:
def filelist(root):
    """Return the path of every file found under ``root``, recursively.

    Paths are produced by joining each directory visited by ``os.walk`` with
    the file names it contains, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(root)
        for filename in filenames
    ]

In [297]:
# Gather the per-grid CSV exports for the 2010-2012 query period.
csv_paths = [path for path in filelist("longterm/2010-2012") if path[-3:] == 'csv']

# Stack them into one frame and discard the index column the exports carry.
df = pd.concat([pd.read_csv(path) for path in csv_paths]).drop(columns=['Unnamed: 0'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45298 entries, 0 to 53
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   grid_id        45298 non-null  int64  
 1   tweet_id       45298 non-null  object 
 2   created_at     45298 non-null  object 
 3   text           45298 non-null  object 
 4   author_id      45292 non-null  object 
 5   place_id       38677 non-null  object 
 6   long           34539 non-null  object 
 7   lat            34539 non-null  float64
 8   full_location  38677 non-null  object 
 9   location       38671 non-null  object 
 10  location_type  38671 non-null  object 
 11  assign_long    45286 non-null  float64
 12  assign_lat     45286 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 4.8+ MB


In [20]:
# Save the merged frame under a file name that encodes the covered date range.
period = '2010-01-01_2012-12-31'
df.to_csv('1_raw_long/{}.csv'.format(period))

# Create a unique database by deduplicating

Due to the overlap between query periods and the splitting of geo grids, some tweets may be duplicated. To address this, we will use the unique tweet_id to remove duplicates and merge all records into a single dataset.

In [4]:
# Read every per-period CSV stored in 1_raw_long and stack them into one frame.
# 'total_long.csv' is the final output this notebook writes into the same
# folder; it uses a different schema, so it is skipped here. Otherwise a
# re-run would concatenate the previous output back into the raw data and
# shift the positional columns the later steps rely on.
raw_csv_files = [
    csv_file for csv_file in filelist("1_raw_long")
    if csv_file.endswith('.csv') and os.path.basename(csv_file) != 'total_long.csv'
]
merge_df = pd.concat([pd.read_csv(csv_file) for csv_file in raw_csv_files],
                     ignore_index=True)
merge_df.head()

Unnamed: 0.1,Unnamed: 0,grid_id,tweet_id,created_at,text,author_id,place_id,long,lat,full_location,location,location_type,assign_long,assign_lat
0,0,3666.0,1497426911743541250,2022-02-26 04:22:51,@PeteAbe @SheilaMullowney I think I would like...,3807247214,0148540119dc25ab,,,"Calimesa, CA",Calimesa,city,-117.066854,33.973347
1,1,3666.0,1494876573517840388,2022-02-19 03:28:43,Big Bear にあるAlpine Slideへ！楽しかった〜😊長い滑り台がいくつかあるテ...,68460337,0148540119dc25ab,,,"Calimesa, CA",Calimesa,city,-117.066854,33.973347
2,0,3672.0,1529508813010632705,2022-05-25 17:04:52,@laxietoo @catboerner @NatashaBertrand @anders...,1310770558317948928,006bbe08633392ba,,,"Desert Edge, CA",Desert Edge,city,-116.418073,33.954765
3,1,3672.0,1524747843528847361,2022-05-12 13:46:28,@LadyPieLives Because the 2nd Amendment guaran...,1310770558317948928,006bbe08633392ba,,,"Desert Edge, CA",Desert Edge,city,-116.418073,33.954765
4,2,3672.0,1503037388523593730,2022-03-13 15:56:53,@NorwichCadets @NorwichWLax @SJCmonks #GoWick ...,18413788,006bbe08633392ba,,,"Desert Edge, CA",Desert Edge,city,-116.418073,33.954765


In [5]:
# Drop the leftover index column, discard rows missing a required field,
# store tweet_id as text and renumber the rows.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# 'Unnamed: 0' column is already gone and a plain drop would raise KeyError.
merge_df = (
    merge_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna(subset=['tweet_id', 'text', 'assign_long', 'assign_lat'])
    .astype({'tweet_id': 'str'})
    .reset_index(drop=True)
)

merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326330 entries, 0 to 326329
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   grid_id        326330 non-null  float64
 1   tweet_id       326330 non-null  object 
 2   created_at     326330 non-null  object 
 3   text           326330 non-null  object 
 4   author_id      326330 non-null  object 
 5   place_id       318083 non-null  object 
 6   long           168237 non-null  object 
 7   lat            168237 non-null  float64
 8   full_location  317974 non-null  object 
 9   location       317974 non-null  object 
 10  location_type  317974 non-null  object 
 11  assign_long    326330 non-null  float64
 12  assign_lat     326330 non-null  float64
dtypes: float64(4), object(9)
memory usage: 32.4+ MB


In [6]:
# Number of distinct tweet ids before deduplication (missing values, if any, count as one).
merge_df['tweet_id'].nunique(dropna=False)

316750

In [7]:
def _collapse_group(group):
    """Merge rows that share a tweet_id into one row with averaged coordinates."""
    # Copy so the caller's rows are never modified in place.
    merged = list(group[-1])
    if len(group) > 1:
        merged[-2] = np.mean([row[-2] for row in group])  # long
        merged[-1] = np.mean([row[-1] for row in group])  # lat
    return merged


def deduplicate(raw):
    """Collapse consecutive rows with the same tweet_id into a single row.

    Parameters
    ----------
    raw : list of list
        Rows in which index 1 holds the tweet_id and the last two elements
        hold the longitude and latitude. Rows with the same tweet_id must be
        adjacent (i.e. the input is sorted by tweet_id).

    Returns
    -------
    list of list
        One row per run of equal tweet_ids. The row is a copy of the last row
        of the run; when the run has more than one row, its last two elements
        are replaced by the mean longitude and mean latitude of the run.
        An empty input yields an empty list.
    """
    results = []
    group = []  # rows of the run currently being accumulated

    for row in raw:
        # A different tweet_id closes the current run.
        if group and row[1] != group[-1][1]:
            results.append(_collapse_group(group))
            group = []
        group.append(row)

    # Flush the final run, which no later row would otherwise close.
    if group:
        results.append(_collapse_group(group))

    return results

In [8]:
# Sort so that rows sharing a tweet_id sit next to each other before deduplicating.
merge_df = merge_df.sort_values('tweet_id')

# NOTE(review): iloc[1:] discards the first row of the sorted frame before
# deduplication; the reason is not visible here - confirm it is intentional.
raw = merge_df.iloc[1:].values.tolist()
results = deduplicate(raw)
len(results)

316749

# Reformat and Save the Database

In order to improve data quality and standardize the data structure, we modify the database schema, adjust data types, and merge columns related to geographic information - if an original coordinate is available, we retain it; otherwise, we use the assigned coordinate.

Once these changes are made, we convert the data to a dataframe and save it as a CSV file, which can be easily imported into Spark for assigning unified county labels.

In [9]:
def isfloat(num):
    """Return True when ``num`` converts to a float that is not NaN.

    Values that cannot be converted at all (e.g. ``None``, arbitrary text,
    an empty string) and every spelling of NaN (``float('nan')``, ``'nan'``,
    ``'NaN'``) return False.
    """
    try:
        value = float(num)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects; ValueError covers bad text.
        return False
    return not np.isnan(value)

In [10]:
def reformat(results):
    """Project deduplicated rows onto the final 8-field layout.

    Each input row is read by position: index 1 is the tweet id, 2 the
    creation timestamp as a ``'%Y-%m-%d %H:%M:%S'`` string, 3-5 are copied
    as-is, 8 is the location text, 6/7 are the tweet's own longitude/latitude
    and the last two elements are the assigned longitude/latitude.
    (Assumes rows keep the column order of the merged frame - TODO confirm
    if the upstream schema changes.)

    Returns a list of 8-element lists:
    ``[tweet_id (str), created (datetime), row[3], row[4], row[5], row[8],
    longitude, latitude]``.
    """
    data = []
    for line in results:
        # Take the tweet's own coordinates only when BOTH parts are numeric;
        # otherwise use the assigned pair, so longitude and latitude never
        # come from two different sources.
        if isfloat(line[6]) and isfloat(line[7]):
            longitude, latitude = float(line[6]), float(line[7])
        else:
            longitude, latitude = line[-2], line[-1]

        data.append([
            str(line[1]),
            datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S'),
            line[3],
            line[4],
            line[5],
            line[8],
            longitude,
            latitude,
        ])
    return data

In [11]:
# Convert the deduplicated rows to the final 8-field layout and preview the first five records.
data = reformat(results)
data[:5]

[['100001445859958785',
  datetime.datetime(2011, 8, 7, 0, 32, 57),
  "Bear Grylls just bit a trout's head off!",
  '18003609',
  'fbd6d2f5a4e4a15e',
  'California, USA',
  -120.645794,
  35.2478663],
 ['1000015974010007552',
  datetime.datetime(2018, 5, 25, 14, 9, 22),
  '@StacyGSG love the Jets t-shirt on the bear',
  596144748,
  'a592bd6ceb1319f7',
  'San Diego, CA',
  -117.1097307686585,
  32.80103765390325],
 ['1000016540027142144',
  datetime.datetime(2018, 5, 25, 14, 11, 37),
  "Kid: what is this this is boring. I wish it was Mickey Mouse club.\nParents: be quiet and watch the movie.\nKid:...\nKid: I guess this is okay...uh mom? Dad?\nDad: [/openly sobbing] POOH BEAR I'M COMING HOME 😭😢\nmom: OH PIGLET SWEETHEART 😭😭\nKid: 🤨😮😕\n#WinnieThePooh https://t.co/5CW04DuRLo",
  31197211,
  '0c2e6999105f8070',
  'Anaheim, CA',
  -117.8290634317465,
  33.8100699306916],
 ['1000020809799254018',
  datetime.datetime(2018, 5, 25, 14, 28, 35),
  'Next weekend’s Gold State is also a fund raiser

In [12]:
# Column names for the reformatted records, in the order reformat() emits them.
header = [
    'tweet_id', 'created_datetime', 'content', 'author_id',
    'place_id', 'location', 'longitude', 'latitude',
]

final = pd.DataFrame(data=data, columns=header)
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316749 entries, 0 to 316748
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   tweet_id          316749 non-null  object        
 1   created_datetime  316749 non-null  datetime64[ns]
 2   content           316749 non-null  object        
 3   author_id         316749 non-null  object        
 4   place_id          308781 non-null  object        
 5   location          308672 non-null  object        
 6   longitude         316749 non-null  float64       
 7   latitude          316749 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 19.3+ MB


In [14]:
# Write the final database to disk (the frame's index is written as the first column).
# NOTE(review): this lands in '1_raw_long', the folder the merge step reads
# with filelist("1_raw_long"); on a re-run this file is picked up as input
# unless it is excluded there.
final.to_csv('1_raw_long/total_long.csv')