In [1]:
import pandas as pd
import ast

In [2]:
df_stops_total = pd.read_csv('../final-tables/stops_table.csv')
df_1976 = pd.read_csv('missing-stops-info-1976-initial.csv')

In [3]:
df_1976.head(3)

Unnamed: 0.1,Unnamed: 0,stop_name,line_count,type,in_lines
0,0,Schlesisches Tor,1,u-bahn,['1']
1,1,Görlitzer Bahnhof (Oranienstr.),1,u-bahn,['1']
2,2,Kottbusser Tor,2,u-bahn,"['8', '1']"


In [4]:
df_1976.drop(columns=['Unnamed: 0'], inplace=True)
df_1976["location"] = ""
df_1976["identifier"] = ""
df_1976["stop_id_other"] = ""
df_1976["previous_in_lines"] = ""
df_1976["stop_description"] = ""

In [5]:
df_stops_total.tail(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
19728,19892665,neuhofer Strasse,bus,"52.411694901653036, 13.503489970696608",{'58'},,
19729,19892666,Öschebronner Weg,bus,"52.61154475375042, 13.337669105320359",{'20'},,
19730,19892667,Übergang,bus,"52.543078667825824, 13.157884773521127",{'63'},,


In [6]:
# Function to convert string representation of set to list
def convert_to_list(value):
    try:
        # Use ast.literal_eval to safely convert string to set
        items = ast.literal_eval(value)
        
        # Check if the result is a set, and convert it to a list
        if isinstance(items, set):
            return list(items)
        # Check if the result is an integer, and convert it to a list with a single element
        elif isinstance(items, int):
            return [items]
        # For other types, return a list with the original value
        else:
            return [value]
    except (SyntaxError, ValueError):
        # Handle the case where literal_eval fails (e.g., invalid string)
        # Try to convert the value to an integer, and return a list with a single element if successful
        try:
            return [int(value)]
        except ValueError:
            return [value]

# Apply the function to the 'in_lines' column
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(convert_to_list)
df_1976['in_lines'] = df_1976['in_lines'].apply(convert_to_list)

In [7]:
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(lambda x: ','.join(map(str, x)))
df_1976['in_lines'] = df_1976['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [8]:
df_stops_total.head(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
0,19460,Adlershof,s-bahn,"52.434722222222,13.541388888889","['KBS 100a', 'KBS 100c']",Q323551,
1,19461,Akazienallee,bus,"52.47849809201335, 13.607179083668733",['D'],,
2,19462,Alexanderplatz,bus,"52.52005246558073, 13.413957243449447","['1', '9']",,


I have two dataframes df_long and df_short, I want to add the information from df_long columns combined_location and combined_identifier to df_short if there is a match between the original_name column from df_long with the stop_name column of df_short. I have to create combined_location and combined_identifier columns in df_short first

In [9]:
for index, row in df_1976.iterrows():
    # Check if row["stop_name"] is in df_1960["original_name"]
    if row["stop_name"] in df_stops_total["stop_name"].values:
        # Find matching rows in df_1960
        matching_rows = df_stops_total[df_stops_total["stop_name"] == row["stop_name"]]
        
        # Check if those two rows have matching row["type"] values
        matching_rows = matching_rows[matching_rows["type"] == row["type"]]
        
        # Check if there are any matching rows
        if not matching_rows.empty:
            df_1976.at[index, 'location'] = '+ '.join(matching_rows['location'].astype(str).values)
            df_1976.at[index, 'identifier'] = '+ '.join(matching_rows['identifier'].astype(str).values)
            df_1976.at[index, 'stop_description'] = '+ '.join(matching_rows['stop_description'].astype(str).values)
            df_1976.at[index, 'stop_id_other'] = '+ '.join(matching_rows['stop_id'].astype(str).values)
            df_1976.at[index, 'previous_in_lines'] = '+ '.join(matching_rows['in_lines'].astype(str).values)
        else:
            # Handle the case where there are no matching rows in df_1960
            df_1976.at[index, 'location'] = ''
            df_1976.at[index, 'identifier'] = ''
            df_1976.at[index, 'stop_description'] = ''
            df_1976.at[index, 'stop_id_other'] = ''
            df_1976.at[index, 'previous_in_lines'] = ''




        # # Convert values to strings and replace NaN with empty strings
        # combined_location = " + ".join(matching_rows["combined_location"].astype(str).fillna(''))
        # combined_identifier = " + ".join(matching_rows["combined_identifier"].astype(str).fillna(''))
            
        # # Update row["location"] and row["identifier"]
        # row["location"] += " + " + combined_location
        # row["identifier"] += " + " + combined_identifier

# Now, df_1961 should have updated "location" and "identifier" columns based on the conditions.


In [10]:
# Function to split string values containing '+' and convert to a unique string
def split_values(cell_value):
    unique_values = sorted(set(element.strip() for element in cell_value.split('+')))
    return '+'.join(unique_values)

# Apply the function to each cell in the specified columns
df_1976['location'] = df_1976['location'].apply(split_values)
df_1976['identifier'] = df_1976['identifier'].apply(split_values)

In [11]:
df_1976.drop(columns=['line_count', "stop_id_other"], inplace=True)

In [12]:
# Function to concatenate unique values in other columns
def concat_unique_values(series):
    unique_values = set(series)
    return '+ '.join(unique_values)

# Group by specified columns and aggregate the other columns
df_1976 = df_1976.groupby(['stop_name', 'type', 'in_lines', 'location']).agg({
    'identifier': concat_unique_values,
    'previous_in_lines': concat_unique_values,
    'stop_description': concat_unique_values
}).reset_index()

In [13]:
df_1976.to_csv('missing-1976-stops-combined.csv')

In [14]:
df_1976.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   stop_name          180 non-null    object
 1   type               180 non-null    object
 2   in_lines           180 non-null    object
 3   location           180 non-null    object
 4   identifier         180 non-null    object
 5   previous_in_lines  180 non-null    object
 6   stop_description   180 non-null    object
dtypes: object(7)
memory usage: 10.0+ KB


In [15]:
df_1976.head(10)

Unnamed: 0,stop_name,type,in_lines,location,identifier,previous_in_lines,stop_description
0,Afrikanische Str. (Friedrich-Ebert-Siedlung),u-bahn,['6'],"52.560027777778,13.334633333333",Q559239,['6']+ ['6']+ ['6'],nan+ nan+ nan
1,Alt-Mariendorf,u-bahn,['6'],"52.4391,13.3877",Q262168,['6']+ ['6']+ ['6']+ ['6']+ 6+ 6+ 6,nan+ nan+ nan+ nan+ nan+ nan+ nan
2,Alt-Tempelhof,u-bahn,['6'],"52.465952777778,13.385897222222",Q559212,['6']+ ['6']+ ['6']+ ['6']+ 6+ 6+ 6,nan+ nan+ nan+ nan+ nan+ nan+ nan
3,Amrumer Str. (Rudolf-Virchow-Krankenhaus),u-bahn,['9'],"52.5425,13.350277777778",Q557577,['9']+ ['9']+ ['9'],nan+ nan+ nan
4,Anhalter Bahnhof,s-bahn,"['KBS 153', 'KBS 154', 'KBS 152']","52.503055555556,13.381944444444",Q111324573,"['KBS 105a', 'KBS 103', 'KBS 104a']+ ['KBS 105...",nan+ nan+ nan+ S-Bahnhof Berlin Anhalter Bahnh...
5,Augsburger Str.,u-bahn,['2'],"52.500556,13.336389",Q667035,"A II,B II+ A II+ ['2']+ ['2']+ ['2']",nan+ nan+ nan+ nan+ nan
6,Bahnhof Friedrichstr. (Stadtbahn),u-bahn,['6'],"52.520111111111,13.388305555556",Q3753274,"['C I']+ C II,C I+ C I+ C I+ ['6']+ ['6']+ ['6']",nan+ Friedrichstraße metro station+ Friedrichs...
7,Bahnhof Wedding,u-bahn,['6'],"52.5425,13.366111",Q4018733,"['C I']+ ['C I']+ C II,C I+ C I+ C I+ ['6']+ [...",nan+ nan+ U-Bahnhof Wedding+ U-Bahnhof Wedding...
8,Bayerischer Platz,u-bahn,"['4', '7']","52.488333333333,13.340277777778",Q668574,['B1']+ ['B I']+ ['B II']+ B III+ B III+ B I+ ...,nan+ nan+ nan+ Bayerischer Platz station+ Baye...
9,Bellevue,s-bahn,"['KBS 156', 'KBS 157']","52.52,13.348055555556",Q374018,"['KBS 101', 'KBS 100c']+ ['KBS 101', 'KBS 103'...",nan+ nan+ nan+ Berlin Bellevue station+ Berlin...
