In [76]:
import pandas as pd
import os
import numpy as np
import time
import pickle
from collections import Counter
import concurrent.futures
from datetime import datetime

In [4]:
# Import data

input_file_path = '/scratch/project_2006761/Milad_Sctraches/nut3_h3_dict.pkl'

# Load the dictionary from the pickle file
with open(input_file_path, 'rb') as file:
    nut3_h3_dict = pickle.load(file)

In [5]:
input_file_path = '/scratch/project_2006761/Milad_Sctraches/unique_user_ids.txt'

unique_user_ids = []
with open(input_file_path, 'r') as file:
    for line in file:
        unique_user_ids.append(line.strip())

In [19]:
len(unique_user_ids)

11735941

In [7]:
input_file_path = '/scratch/project_2006761/Milad_Sctraches/shapefiles/hex_gdf.pkl'

# Load the dictionary from the pickle file
with open(input_file_path, 'rb') as file:
    hex_gdf = pickle.load(file)

In [8]:
# Function to safely convert datetime columns to UTC
def safe_convert_to_utc(column):
    # Attempt to convert any strings or other formats to datetime first without specifying timezone
    column = pd.to_datetime(column, errors='coerce')
    # If already timezone-aware, convert to UTC
    if column.dt.tz is not None:
        return column.dt.tz_convert('UTC')
    else:  # If timezone-naive, localize to UTC
        return column.dt.tz_localize('UTC')

In [9]:
# Function to determine period based on date
def determine_period(date):
    global time_periods
    for period_name, (start, end) in period_dates.items():
        if start <= date <= end:
            return period_name
    return None

In [11]:
# Define the base directory path containing the period folders
base_directory_path = '/projappl/project_2006761/Twitter_Ate/eu-parquet-moves-only-max-180-days-final'

distance_threshold = 100 # KM

# For Cleaning
columns_to_keep = ['u_id', 'created_at_start', 'h3_grid_res10_start', 'NUTS_ID_start',
                       'created_at_end', 'h3_grid_res10_end', 'NUTS_ID_end']

period_dates = {
    'before_2016': (pd.Timestamp('2000-01-01').tz_localize('UTC'), pd.Timestamp('2015-12-31').tz_localize('UTC')),
    '2016_to_2019': (pd.Timestamp('2016-01-01').tz_localize('UTC'), pd.Timestamp('2019-12-31').tz_localize('UTC')),
    '2020_and_beyond': (pd.Timestamp('2020-01-01').tz_localize('UTC'), pd.Timestamp('2023-12-31').tz_localize('UTC'))
}

In [92]:
#user_id_of_interest = '151380704'

In [100]:


def process_user(user_id_of_interest):
    try:
        
        global distance_threshold, columns_to_keep, base_directory_path
        
        user_id_first_digit = user_id_of_interest[0]  # Get the first digit/character
        
        # Initialize an empty DataFrame to store data for the user of interest
        user_data = pd.DataFrame()
        
        # Define periods to iterate over
        periods = ['period=1', 'period=2']
        
        # Iterate only through the relevant directories based on the user ID's first digit
        for period in periods:
            user_digit_dir = f"user_first_digit={user_id_first_digit}"
            directory_path = os.path.join(base_directory_path, period, user_digit_dir)
            
            # Check if the specific directory exists before attempting to read files
            if os.path.exists(directory_path):
                for file in os.listdir(directory_path):
                    if file.endswith('.parquet'):
                        file_path = os.path.join(directory_path, file)
                        try:
                            # Attempt to read and filter the Parquet file
                            temp_df = pd.read_parquet(file_path, filters=[('u_id', '=', user_id_of_interest)])
                            
                            # Append the filtered data to the user_data DataFrame
                            # Check if temp_df is not empty and not all NA
                            if not temp_df.empty and not temp_df.isna().all().all():
                                # Only then, concatenate with user_data
                                user_data = pd.concat([user_data, temp_df], ignore_index=True)
                            
                        except Exception as e:
                            print(f"Failed to process {file_path}: {e}")
        
        # Celan the data
        user_data = user_data[columns_to_keep]
        
    
        # Apply the conversion/localization function to your datetime columns
        user_data['created_at_start'] = safe_convert_to_utc(user_data['created_at_start'])
        user_data['created_at_end'] = safe_convert_to_utc(user_data['created_at_end'])
        
        user_trips = user_data.copy()
        
        user_data['d_t'] = (user_data['created_at_end'] - user_data['created_at_start']).dt.total_seconds() / (2 * 86400)
        
        # Create the DataFrame for the 'start' information
        start_df = user_data[['u_id', 'created_at_start', 'h3_grid_res10_start', 'NUTS_ID_start', 'd_t']].copy()
        # Rename the columns to have a unified naming convention
        start_df.columns = ['u_id', 'created_at', 'h3_grid_res10', 'NUTS_ID', 'd_t']
        
        # Create the DataFrame for the 'end' information
        end_df = user_data[['u_id', 'created_at_end', 'h3_grid_res10_end', 'NUTS_ID_end', 'd_t']].copy()
        # Rename the columns to match the start_df
        end_df.columns = ['u_id', 'created_at', 'h3_grid_res10', 'NUTS_ID', 'd_t']
        
        # Concatenate the two DataFrames
        user_data = pd.concat([start_df, end_df], ignore_index=True).sort_values(by='created_at', ascending=True)
        
        # Group by 'created_at' and aggregate
        user_data = user_data.groupby('created_at').agg({
            'u_id': 'first',  # Keep the first u_id encountered
            'h3_grid_res10': 'first',  # Keep the first h3_grid_res10 encountered
            'NUTS_ID': 'first',  # Keep the first NUTS_ID encountered
            'd_t': 'sum'  # Sum the d_t values
        }).reset_index()
    
        # Fill the None values in NUTS_ID columns
        # Check if there are any NaN values in the NUTS_ID column
        if user_data['NUTS_ID'].isna().any():
            # Map h3_grid_res10 to NUTS_ID where NUTS_ID is NaN, retaining NaN for unmatched keys in nut3_h3_dict
            user_data.loc[user_data['NUTS_ID'].isna(), 'NUTS_ID'] = (
                user_data.loc[user_data['NUTS_ID'].isna(), 'h3_grid_res10']
                .map(nut3_h3_dict)
                .fillna(user_data['NUTS_ID'])
            )
    
        # Ther should not be any na, but incase
        user_data.dropna(inplace=True)
    
        # Go to the next user if the data is empty
        if user_data.empty:
            return
        
        # Calculate the significant places based on frequency
        
        # Identify the first and last date
        first_date = user_data['created_at'].iloc[0]
        last_date = user_data['created_at'].iloc[-1]
        
        if (last_date - first_date).days > 180:
            
            # Identify the first and last year
            start_year = first_date.year
            end_year = last_date.year
            
            # Initialize a list to hold pairs of NUTS_ID that meet the condition
            nuts_id_pairs = set()
            nuts_dwell_time = set()
    
            # For Cross-Border
            nuts_id_pairs_cross = set()
            nuts_dwell_time_cross = set()     
            
            # Loop through the range
            for year in range(start_year, end_year+1):
        
                # For the first year in the range, consider the year itself and the next two ones
                if year == start_year:
                    window_start = pd.to_datetime(f'{year}-01-01').tz_localize('UTC')
                    window_end = pd.to_datetime(f'{year+2}-12-31').tz_localize('UTC')
                # For the last year in the range, consider the year itself and the previous two ones
                elif year == end_year:
                    window_start = pd.to_datetime(f'{year-2}-01-01').tz_localize('UTC')
                    window_end = pd.to_datetime(f'{year}-12-31').tz_localize('UTC')
                # For years in between, include one year before and one after
                else:
                    window_start = pd.to_datetime(f'{year-1}-01-01').tz_localize('UTC')
                    window_end = pd.to_datetime(f'{year+1}-12-31').tz_localize('UTC')
            
                # Use window_start and window_end to filter or perform operations on user_data
                filtered_data = user_data[(user_data['created_at'] >= window_start) & (user_data['created_at'] <= window_end)]
                
                # Count the occurrences of each unique value in 'NUTS_ID_start'
                occurrences = filtered_data['NUTS_ID'].value_counts()
                # Calculate the 90th percentile value of the occurrences Series
                quantile_75 = occurrences.quantile(0.75)
                
                # Filter the Series to include only values above the 90th percentile
                top_quantile_h3 = occurrences[occurrences > quantile_75].index.tolist()
                
                # Filter the original DataFrame to include only rows with those h3_grid_res10 values
                filtered_top_quantile_data = filtered_data[filtered_data['NUTS_ID'].isin(top_quantile_h3)]
            
                filtered_top_quantile_data = filtered_top_quantile_data[['h3_grid_res10', 'NUTS_ID']].drop_duplicates()
            
                nuts_ids = filtered_top_quantile_data['NUTS_ID'].unique()
    
                # Filter only those we need to optimize
                hex_gdf_user = hex_gdf[hex_gdf['hex_id'].isin(filtered_top_quantile_data['h3_grid_res10'].unique())] 
                
                # Check if there's more than one unique NUTS_ID before proceeding
                if len(nuts_ids) > 1:
                    for i, nuts_id1 in enumerate(nuts_ids):
                        for nuts_id2 in nuts_ids[i+1:]:
                        
                            # Filter df for each pair of NUTS_ID
                            df1 = filtered_top_quantile_data[filtered_top_quantile_data['NUTS_ID'] == nuts_id1]
                            df2 = filtered_top_quantile_data[filtered_top_quantile_data['NUTS_ID'] == nuts_id2]
                            
                            # Calculate distances between all pairs of h3_grid_res10 in df1 and df2
                            for h3_index1 in df1['h3_grid_res10']:
                                for h3_index2 in df2['h3_grid_res10']:
    
                                    try: # In case there is a NaN value
                                        # Calculate the distance in meters
                                        # Extract the geometry for each hex_id directly
                                        geom1 = hex_gdf_user.loc[hex_gdf_user['hex_id'] == h3_index1, 'geometry'].values[0]
                                        geom2 = hex_gdf_user.loc[hex_gdf_user['hex_id'] == h3_index2, 'geometry'].values[0]
                                        
                                        # Calculate the distance between the two geometries
                                        distance = geom1.distance(geom2) / 1000
                                        
                                        # Check if the distance exceeds the threshold
                                        if distance > distance_threshold:
                                            # Add the pair of NUTS_ID to the list and break the loop
                                            nuts_id_pairs.add((year, nuts_id1, nuts_id2))
                                            nuts_dwell_time.add((year, nuts_id1, filtered_data[filtered_data.NUTS_ID==nuts_id1].d_t.sum()))
                                            nuts_dwell_time.add((year, nuts_id2, filtered_data[filtered_data.NUTS_ID==nuts_id2].d_t.sum()))
                                            break
                                    except:
                                        pass
                                else:
                                    # Continue if the inner loop wasn't broken
                                    continue
                                # Inner loop was broken, break the outer loop
                                break

                            if nuts_id1[:2] != nuts_id2[:2]:
                                nuts_id_pairs_cross.add((year, nuts_id1, nuts_id2))
                                nuts_dwell_time_cross.add((year, nuts_id1, filtered_data[filtered_data.NUTS_ID==nuts_id1].d_t.sum()))
                                nuts_dwell_time_cross.add((year, nuts_id2, filtered_data[filtered_data.NUTS_ID==nuts_id2].d_t.sum()))

                    
            # Initialize two dictionaries to hold the values for greater or equal to 90 and less than 90
            greater_equal_90 = {}
            #less_than_90 = {}
            
            # Iterate through the list and categorize based on the third item in each tuple
            for year, id, value in nuts_dwell_time:
                if value >= 90:
                    if year in greater_equal_90:
                        greater_equal_90[year].append(id)
                    else:
                        greater_equal_90[year] = [id]
                # else:
                #     if year in less_than_90:
                #         less_than_90[year].append(id)
                #     else:
                #         less_than_90[year] = [id]
            
            # Convert dictionaries to list of tuples
            sig_locations = [(year, ids) for year, ids in greater_equal_90.items()]
            #non_sig_locations = [(year, ids) for year, ids in less_than_90.items()]
    
            first_year = first_date.year
            last_year = last_date.year
            
            # Initialize a dictionary to count occurrences of each ID within a 5-year window
            window_counts = {}
            
            # Iterate through each year in the significant locations
            for year, ids in sig_locations:
                # Determine the window based on the year's position relative to the first and last years
                if year == first_year:
                    check_years = range(year, year + 5)  # First year: This year + next 4 years
                elif year == first_year + 1:
                    check_years = range(year - 1, year + 4)  # Second year: One year before + next 3 years
                elif year == last_year - 1:
                    check_years = range(year - 3, year + 2)  # Second to last year: Three years before + one year after
                elif year == last_year:
                    check_years = range(year - 4, year + 1)  # Last year: Four years before + this year
                else:
                    check_years = range(year - 2, year + 3)  # Other years: Two years before + two years after
            
                for check_year in check_years:
                    if check_year in [y for y, _ in sig_locations]:  # Check if the year is within the significant locations
                        for id in ids:
                            # Increment the count for this ID in the adjusted window
                            if id not in window_counts:
                                window_counts[id] = {year: 1}
                            else:
                                if year not in window_counts[id]:
                                    window_counts[id][year] = 1
                                else:
                                    window_counts[id][year] += 1
    
            # For Cross-Border commuting
            
            # Initialize two dictionaries to hold the values for greater or equal to 90 and less than 90
            greater_equal_90 = {}
            #less_than_90 = {}
            
            # Iterate through the list and categorize based on the third item in each tuple
            for year, id, value in nuts_dwell_time_cross:
                if value >= 90:
                    if year in greater_equal_90:
                        greater_equal_90[year].append(id)
                    else:
                        greater_equal_90[year] = [id]
                # else:
                #     if year in less_than_90:
                #         less_than_90[year].append(id)
                #     else:
                #         less_than_90[year] = [id]
            
            # Convert dictionaries to list of tuples
            sig_locations_cross = [(year, ids) for year, ids in greater_equal_90.items()]
    
            # Define the periods
            periods = {
                'before_2016': set(),
                '2016_to_2019': set(),
                '2020_and_beyond': set()
            }
    
            # Merge sig_locations into three time periods
            for year, ids in sig_locations:
                if year < 2016:
                    periods['before_2016'].update(ids)
                elif 2016 <= year <= 2019:
                    periods['2016_to_2019'].update(ids)
                else:  # 2020 and beyond
                    periods['2020_and_beyond'].update(ids)

            
            # Define the periods
            periods_cross = {
                'before_2016': set(),
                '2016_to_2019': set(),
                '2020_and_beyond': set()
            }
    
            # Merge sig_locations into three time periods
            for year, ids in sig_locations_cross:
                if year < 2016:
                    periods_cross['before_2016'].update(ids)
                elif 2016 <= year <= 2019:
                    periods_cross['2016_to_2019'].update(ids)
                else:  # 2020 and beyond
                    periods_cross['2020_and_beyond'].update(ids)
            
            # Get the primary location
            # Apply function to assign period to each row in DataFrame
            user_data['Period'] = user_data['created_at'].apply(determine_period)

            # Convert nuts dwell time to a df for faster computation to get sig_locations_dict
            df_temp = pd.DataFrame(nuts_dwell_time, columns=['Year', 'ID', 'Value'])
            
            # Define periods as a series of conditions and choices
            conditions = [
                df_temp['Year'] < 2016,
                df_temp['Year'].between(2016, 2019),
                df_temp['Year'] >= 2020
            ]
            choices = ['before_2016', '2016_to_2019', '2020_and_beyond']
            
            # Assign period based on year
            df_temp['Period'] = np.select(conditions, choices)
            
            # Group by Period and ID, then sum values
            grouped = df_temp.groupby(['Period', 'ID'])['Value'].sum().reset_index()
            
            # Find the ID with the highest value within each period
            idx = grouped.groupby(['Period'])['Value'].idxmax()
            highest_value_id_per_period = grouped.loc[idx]


            # Transform into a more lookup-friendly structure
            lookup_dict = {}
            for year, id1, id2 in nuts_id_pairs:
                key = frozenset([id1, id2])
                if key in lookup_dict:
                    lookup_dict[key].add(year)
                else:
                    lookup_dict[key] = {year}

            # Transform into a more lookup-friendly structure for Cross Border pairs
                lookup_dict_cross = {}
                for year, id1, id2 in nuts_id_pairs_cross:
                    key = frozenset([id1, id2])
                    if key in lookup_dict_cross:
                        lookup_dict_cross[key].add(year)
                    else:
                        lookup_dict_cross[key] = {year}
            
            '''
            MULTILOCAL and LONG DISTANCE MOBILITY TYPE
            
            '''
            
            # Filter IDs that have more than 2 occurrences in any 5-year window
            ml_ld_ids = [id for id, years in window_counts.items() if any(count > 2 for count in years.values())]
            
            # Initialize an empty set to store the triples
            multilocal_trips_user = set()

            # Initialize the set for LD_trips_user
            LD_trips_user = set()

            # Iterate through the periods and their corresponding IDs
            for period, ids_in_period in periods.items():
                # Find the intersection of multilocal_ids and the current period's IDs
                relevant_ids = set(ml_ld_ids).intersection(ids_in_period)
                
                # Proceed if we have more than one ID in the intersection, thus capable of forming a pair
                if len(relevant_ids) > 1:

                    home_id_series = highest_value_id_per_period.loc[highest_value_id_per_period['Period'] == period, 'ID']

                    if not home_id_series.empty:
                        home_NUTS_ID = home_id_series.iloc[0]
    
                        # Filter user_data
                        filtered_data = user_data[(user_data['NUTS_ID'].isin(relevant_ids)) & (user_data['Period'] == period)]
                        # Create an independent copy of filtered_data to avoid SettingWithCopyWarning
                        filtered_data = filtered_data.copy()
                        filtered_data['Prev_NUTS_ID'] = filtered_data['NUTS_ID'].shift(1)
                        filtered_data = filtered_data[filtered_data['NUTS_ID'] != filtered_data['Prev_NUTS_ID']].drop(columns='Prev_NUTS_ID')

                        if len(filtered_data) > 1:
    
                            # Create pairs of consecutive NUTS_ID
                            filtered_data['Next_NUTS_ID'] = filtered_data['NUTS_ID'].shift(-1)
                            pairs = filtered_data[['NUTS_ID', 'Next_NUTS_ID']].dropna().apply(frozenset, axis=1)
        
                            # Filter pairs to only include those with home_NUTS_ID
                            filtered_pairs = pairs[pairs.apply(lambda x: home_NUTS_ID in x)]
        
                            # Count each unique pair
                            pair_counts = Counter(filtered_pairs)
                            
                            # Total count of pairs
                            total_pairs = sum(pair_counts.values())
        
                            if total_pairs > 0:
                            
                                # Find the pair with the highest count
                                most_common_pair, most_common_count = pair_counts.most_common(1)[0]
                                
                                # Calculate the ratio
                                ratio = most_common_count / total_pairs          
                                
                                # Check if the ratio of the most common pair is more than 0.5
                                if ratio > 0.5:
                                    # Extract the other NUTS ID in the pair besides home_NUTS_ID
                                    other_NUTS_ID = next(iter(most_common_pair - frozenset([home_NUTS_ID])))

                                    # Perform lookup
                                    pair_key = frozenset([home_NUTS_ID, other_NUTS_ID])
                                    if pair_key in lookup_dict:
                                        # Use period from the outer loop, home_NUTS_ID and other_NUTS_ID for the triple
                                        LD_trips_user.add((period, home_NUTS_ID, other_NUTS_ID))

                                else:
                                    # If the ratio does not exceed 0.5, add pairs from filtered_pairs to multilocal_trips_user
                                    for pair in filtered_pairs:
                                        # Ensure the pair includes home_NUTS_ID and extract the other NUTS_ID
                                        other_NUTS_ID = next(iter(pair - frozenset([home_NUTS_ID])))
                                        # Perform lookup
                                        pair_key = frozenset([home_NUTS_ID, other_NUTS_ID])
                                        if pair_key in lookup_dict:
                                            # Use period from the outer loop, home_NUTS_ID and other_NUTS_ID for the triple
                                            multilocal_trips_user.add((period, home_NUTS_ID, other_NUTS_ID))
                                        

            
            '''
            NOMAD MOBILITY TYPE
            
            '''
            
            # Convert nuts_dwell_time to a more accessible structure
            dwell_time_dict = {}
            for year, id, time_ in nuts_dwell_time:
                if id not in dwell_time_dict:
                    dwell_time_dict[id] = {}
                dwell_time_dict[id][year] = time_
            
            nomad_ids = []
            for id, years_counts in window_counts.items():
                years = list(years_counts.keys())
                # Check for IDs with exactly one occurrence
                if any(count == 1 for count in years_counts.values()):
                    nomad_ids.append(id)
                
                # Check for IDs with exactly two occurrences in consecutive years and total time <= 180
                elif sum(years_counts.values()) == 2 and len(years)>1:
                    if abs(years[0] - years[1]) == 1:  # Consecutive years
                        total_time = sum(dwell_time_dict[id].get(year, 0) for year in years)
                        if total_time <= 365:
                            nomad_ids.append(id)
                            
            # Initialize an empty list to store the triples
            nomad_trips_user = set()           
            
            # 3 Significant locations needed at least
            if len(nomad_ids) > 2:

                
                
                # Iterate through each period
                for period, ids_in_period in periods.items():
        
                    # Filter IDs for this period
                    filtered_ids = user_data[(user_data['NUTS_ID'].isin(nomad_ids)) & (user_data['Period'] == period)]
                    
                    # Drop duplicates and sort
                    filtered_ids = filtered_ids.drop_duplicates(subset=['NUTS_ID'], keep='first')
                    
                    # Extract sorted NUTS_IDs
                    sorted_ids = filtered_ids['NUTS_ID'].tolist()
        
                    if len(sorted_ids) > 1:
                        
                        # Create triples by iterating through sorted_ids and pairing each ID with the next one
                        for i in range(len(sorted_ids) - 1):
                            start_id = sorted_ids[i]
                            end_id = sorted_ids[i + 1]
                            
                            # Perform lookup
                            pair_key = frozenset([start_id, end_id])
                            if pair_key in lookup_dict:
                                # Use period from the outer loop, start_id and end_id for the triple
                                nomad_trips_user.add((period, start_id, end_id))
    
    
            '''
            Cross Border
        
            '''
    
            CB_ids = set() # Cross Border
            for _, ids in sig_locations_cross:
                for id in ids:
                    CB_ids.add(id)
            CB_ids = list(CB_ids)

            # Initialize an empty list to store the triples
            CB_trips_user = set()

            # Iterate through the periods and their corresponding IDs
            for period, ids_in_period in periods_cross.items():
                # Find the intersection of multilocal_ids and the current period's IDs
                relevant_ids = set(CB_ids).intersection(ids_in_period)
                
                # Proceed if we have more than one ID in the intersection, thus capable of forming a pair
                if len(relevant_ids) > 1:

                    home_id_series = highest_value_id_per_period.loc[highest_value_id_per_period['Period'] == period, 'ID']

                    if not home_id_series.empty:
                        home_NUTS_ID = home_id_series.iloc[0]
    
                        # Filter user_data
                        filtered_data = user_data[(user_data['NUTS_ID'].isin(relevant_ids)) & (user_data['Period'] == period)]
                        # Create an independent copy of filtered_data to avoid SettingWithCopyWarning
                        filtered_data = filtered_data.copy()
                        filtered_data['Prev_NUTS_ID'] = filtered_data['NUTS_ID'].shift(1)
                        filtered_data = filtered_data[filtered_data['NUTS_ID'] != filtered_data['Prev_NUTS_ID']].drop(columns='Prev_NUTS_ID')

                        if len(filtered_data) > 1:
    
                            # Create pairs of consecutive NUTS_ID
                            filtered_data['Next_NUTS_ID'] = filtered_data['NUTS_ID'].shift(-1)
                            pairs = filtered_data[['NUTS_ID', 'Next_NUTS_ID']].dropna().apply(frozenset, axis=1)
        
                            # Filter pairs to only include those with home_NUTS_ID
                            filtered_pairs = pairs[pairs.apply(lambda x: home_NUTS_ID in x)]
        
                            for pair in filtered_pairs:
                                # Extract the other NUTS_ID
                                other_NUTS_ID = next(iter(pair - frozenset([home_NUTS_ID])))
                                if home_NUTS_ID[:2] != other_NUTS_ID[:2]: # If they are from different countries
                                    # Perform lookup
                                    pair_key = frozenset([home_NUTS_ID, other_NUTS_ID])
                                    if pair_key in lookup_dict_cross:
                                        # Use period from the outer loop, home_NUTS_ID and other_NUTS_ID for the triple
                                        CB_trips_user.add((period, home_NUTS_ID, other_NUTS_ID))
    
                                    
            
    
            '''
            Finally return the results
            '''

            return multilocal_trips_user, nomad_trips_user, CB_trips_user, LD_trips_user

        else:
            return
        
    except Exception as e:
        print(f"Error: {e}")  # This will print the error message
        print(user_id_of_interest)
        return


In [101]:
# Function to apply updates in bulk
def apply_updates_to_dataframes(updates_list):
    # Global DataFrames
    global multilocal_trips, nomad_trips, CB_trips, LD_trips
    
    for user_updates in updates_list:
        multilocal_trips_user, nomad_trips_user, CB_trips_user, LD_trips_user = user_updates
    
        # Iterate over the triples
        for year, nuts_id1, nuts_id2 in multilocal_trips_user:
            # Check if this combination exists in the DataFrame
            match = multilocal_trips[(multilocal_trips['NUTS_ID1'] == nuts_id1) & 
                                   (multilocal_trips['NUTS_ID2'] == nuts_id2) & 
                                   (multilocal_trips['Period'] == year)]
            
            if not match.empty:
                # If exists, increment the 'Count'
                multilocal_trips.loc[match.index, 'Count'] += 1
            else:
                # If not exists, create a new DataFrame for the row and concatenate it
                new_row_df = pd.DataFrame({'NUTS_ID1': [nuts_id1], 'NUTS_ID2': [nuts_id2], 'Period': [year], 'Count': [1]})
                multilocal_trips = pd.concat([multilocal_trips, new_row_df], ignore_index=True)
        
        
        # Iterate over the triples 
        for year, nuts_id1, nuts_id2 in nomad_trips_user:
            # Check if this combination exists in the DataFrame
            match = nomad_trips[(nomad_trips['NUTS_ID1'] == nuts_id1) & 
                                   (nomad_trips['NUTS_ID2'] == nuts_id2) & 
                                   (nomad_trips['Period'] == year)]
            
            if not match.empty:
                # If exists, increment the 'Count'
                nomad_trips.loc[match.index, 'Count'] += 1
            else:
                # If not exists, create a new DataFrame for the row and concatenate it
                new_row_df = pd.DataFrame({'NUTS_ID1': [nuts_id1], 'NUTS_ID2': [nuts_id2], 'Period': [year], 'Count': [1]})
                nomad_trips = pd.concat([nomad_trips, new_row_df], ignore_index=True)
    
    
        # Iterate over the triples 
        for year, nuts_id1, nuts_id2 in CB_trips_user:
            # Check if this combination exists in the DataFrame
            match = CB_trips[(CB_trips['NUTS_ID1'] == nuts_id1) & 
                                   (CB_trips['NUTS_ID2'] == nuts_id2) & 
                                   (CB_trips['Period'] == year)]
            
            if not match.empty:
                # If exists, increment the 'Count'
                CB_trips.loc[match.index, 'Count'] += 1
            else:
                # If not exists, create a new DataFrame for the row and concatenate it
                new_row_df = pd.DataFrame({'NUTS_ID1': [nuts_id1], 'NUTS_ID2': [nuts_id2], 'Period': [year], 'Count': [1]})
                CB_trips = pd.concat([CB_trips, new_row_df], ignore_index=True)

        
        # Iterate over the triples 
        for year, nuts_id1, nuts_id2 in LD_trips_user:
            # Check if this combination exists in the DataFrame
            match = LD_trips[(LD_trips['NUTS_ID1'] == nuts_id1) & 
                                   (LD_trips['NUTS_ID2'] == nuts_id2) & 
                                   (LD_trips['Period'] == year)]
            
            if not match.empty:
                # If exists, increment the 'Count'
                LD_trips.loc[match.index, 'Count'] += 1
            else:
                # If not exists, create a new DataFrame for the row and concatenate it
                new_row_df = pd.DataFrame({'NUTS_ID1': [nuts_id1], 'NUTS_ID2': [nuts_id2], 'Period': [year], 'Count': [1]})
                LD_trips = pd.concat([LD_trips, new_row_df], ignore_index=True)

In [None]:
# Start the timer
start_time = time.time()

print(f"Start time is {datetime.now()}")

num_workers = 8

# Assuming these are your global DataFrames initialized somewhere above
multilocal_trips = pd.DataFrame(columns=['NUTS_ID1', 'NUTS_ID2', 'Period', 'Count'])
nomad_trips = pd.DataFrame(columns=['NUTS_ID1', 'NUTS_ID2', 'Period', 'Count'])
CB_trips = pd.DataFrame(columns=['NUTS_ID1', 'NUTS_ID2', 'Period', 'Count'])
LD_trips = pd.DataFrame(columns=['NUTS_ID1', 'NUTS_ID2', 'Period', 'Count'])

# Placeholder for updates collected from all users
all_user_updates = []

# Use ProcessPoolExecutor to parallelize processing
with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:

    # Map process_user function to each user ID, and iterate over results as they become available
    all_user_updates.extend(result for result in executor.map(process_user, unique_user_ids[:120000]) if result is not None)
    
# After collecting all updates, apply them to the global DataFrames
apply_updates_to_dataframes(all_user_updates)

# Stop the timer
end_time = time.time()
print(f"End time is {datetime.now()}")
print("-"*50)

# Calculate the total time taken
total_time = end_time - start_time
print(f"Total time taken to process the data: {total_time} seconds")

Start time is 2024-04-06 17:31:36.076416


In [81]:
apply_updates_to_dataframes(all_user_updates)

In [95]:
59709.025122/37862*11000000/60/60/24

200.7776524267222

In [123]:
18*60*60

64800

In [147]:
(end_time - start_time)/120000*11000000/60/60/24/

76.7026971290463

In [149]:
77/14

5.5

In [130]:
nomad_trips

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count


In [133]:
multilocal_trips.Count.sum()

45

In [134]:
CB_trips.Count.sum()

326

In [129]:
apply_updates_to_dataframes(all_user_updates)

In [None]:
multilocal_trips.to_csv('multilocal_trips_v4.csv')

In [None]:
nomad_trips.to_csv('nomad_trips_v4.csv')

In [None]:
CB_trips.to_csv('CB_trips_v4.csv')

In [None]:
LD_trips.to_csv('LD_trips_v4.csv')

In [143]:
nomad_trips

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count


In [142]:
multilocal_trips

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count
0,ES300,ES523,before_2016,1
1,ES300,ES709,before_2016,1
2,ES120,ES111,2016_to_2019,1
3,ES120,ES300,2016_to_2019,1
4,DE501,DE712,2016_to_2019,1
...,...,...,...,...
84,ES705,ES220,2016_to_2019,1
85,ES130,ES220,before_2016,1
86,ES705,ES213,2016_to_2019,1
87,ES130,ES213,before_2016,1


In [132]:
5334.927958250046/10000*11000000/60/60/24

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count


In [140]:
CB_trips

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count
0,ES300,UKI71,before_2016,1
1,DE501,ES512,before_2016,1
2,DED41,ITF64,before_2016,1
3,DE128,PL524,before_2016,1
4,FR101,ES300,before_2016,2
...,...,...,...,...
648,AT334,DE714,2020_and_beyond,1
649,TR321,ITG17,before_2016,1
650,TR100,UKI45,before_2016,1
651,TR823,LT022,before_2016,1


In [141]:
LD_trips

Unnamed: 0,NUTS_ID1,NUTS_ID2,Period,Count
0,ES300,UKI71,before_2016,1
1,DE501,ES512,before_2016,1
2,ES612,ES300,before_2016,3
3,DED41,ITF64,before_2016,1
4,SE121,SE313,before_2016,1
...,...,...,...,...
2259,TR611,TR221,2016_to_2019,1
2260,FRL04,FRI32,2020_and_beyond,1
2261,UKE42,UKL22,2016_to_2019,1
2262,FR101,FRI32,2020_and_beyond,1


In [139]:
5334.927958250046/10000*400000/60/60

59.27697731388939