In [131]:
import pandas as pd
import ast

In [132]:
df_stops_total = pd.read_csv('../final-tables/stops_table.csv')
df_1956 = pd.read_csv('stops_df_1956-initial.csv')

In [133]:
df_1956.head(3)

Unnamed: 0.1,Unnamed: 0,stop_name,line_count,type,in_lines
0,0,Schillingbrücke (Ostbahnhof),1,strassenbahn,['1']
1,1,Stalinallee,8,strassenbahn,['1']
2,2,Stalinallee,8,u-bahn,['E']


In [134]:
df_1956.drop(columns=['Unnamed: 0'], inplace=True)
df_1956["location"] = ""
df_1956["identifier"] = ""
df_1956["stop_id_other"] = ""
df_1956["previous_in_lines"] = ""
df_1956["stop_description"] = ""

In [135]:
df_stops_total.tail(3)

Unnamed: 0,stop_id,stop_description,type,in_lines,stop_name,location,identifier
4231,19641453,Zum Heckeshorn Ecke Am Großen Wannsee,bus,A51,Zum Heckeshorn,"52.43307505069834, 13.164008736441371",
4232,19641454,,bus,A63,Zweibrücker Strasse,"52.54677409100436, 13.188764018875105",
4233,19641455,,bus,AS,Zühlsdorf,"52.7162250652868, 13.366925671584623",


In [136]:
# Function to convert string representation of set to list
def convert_to_list(value):
    try:
        # Use ast.literal_eval to safely convert string to set
        items = ast.literal_eval(value)
        
        # Check if the result is a set, and convert it to a list
        if isinstance(items, set):
            return list(items)
        # Check if the result is an integer, and convert it to a list with a single element
        elif isinstance(items, int):
            return [items]
        # For other types, return a list with the original value
        else:
            return [value]
    except (SyntaxError, ValueError):
        # Handle the case where literal_eval fails (e.g., invalid string)
        # Try to convert the value to an integer, and return a list with a single element if successful
        try:
            return [int(value)]
        except ValueError:
            return [value]

# Apply the function to the 'in_lines' column
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(convert_to_list)
df_1956['in_lines'] = df_1956['in_lines'].apply(convert_to_list)

In [137]:
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(lambda x: ','.join(map(str, x)))
df_1956['in_lines'] = df_1956['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [138]:
df_stops_total.head(3)

Unnamed: 0,stop_id,stop_description,type,in_lines,stop_name,location,identifier
0,19601,4. Ring,bus,A53,4. Ring,"52.42250630689789, 13.2869490566549",
1,19602,Adamstrasse Ecke Pichelsdorfer Strasse,bus,"A35,A34",Adamstrasse,"52.521029117218866, 13.19780699419913",
2,19603,Berlin-Adlershof station,s-bahn,"KBS 103,KBS 100a",Adlershof,"52.434722222222,13.541388888889",Q323551


I have two dataframes df_long and df_short, I want to add the information from df_long columns combined_location and combined_identifier to df_short if there is a match between the original_name column from df_long with the stop_name column of df_short. I have to create combined_location and combined_identifier columns in df_short first

In [139]:
for index, row in df_1956.iterrows():
    # Check if row["stop_name"] is in df_1960["original_name"]
    if row["stop_name"] in df_stops_total["stop_name"].values:
        # Find matching rows in df_1960
        matching_rows = df_stops_total[df_stops_total["stop_name"] == row["stop_name"]]
        
        # Check if those two rows have matching row["type"] values
        matching_rows = matching_rows[matching_rows["type"] == row["type"]]
        
        # Check if there are any matching rows
        if not matching_rows.empty:
            df_1956.at[index, 'location'] = '+ '.join(matching_rows['location'].astype(str).values)
            df_1956.at[index, 'identifier'] = '+ '.join(matching_rows['identifier'].astype(str).values)
            df_1956.at[index, 'stop_description'] = '+ '.join(matching_rows['stop_description'].astype(str).values)
            df_1956.at[index, 'stop_id_other'] = '+ '.join(matching_rows['stop_id'].astype(str).values)
            df_1956.at[index, 'previous_in_lines'] = '+ '.join(matching_rows['in_lines'].astype(str).values)
        else:
            # Handle the case where there are no matching rows in df_1960
            df_1956.at[index, 'location'] = ''
            df_1956.at[index, 'identifier'] = ''
            df_1956.at[index, 'stop_description'] = ''
            df_1956.at[index, 'stop_id_other'] = ''
            df_1956.at[index, 'previous_in_lines'] = ''




        # # Convert values to strings and replace NaN with empty strings
        # combined_location = " + ".join(matching_rows["combined_location"].astype(str).fillna(''))
        # combined_identifier = " + ".join(matching_rows["combined_identifier"].astype(str).fillna(''))
            
        # # Update row["location"] and row["identifier"]
        # row["location"] += " + " + combined_location
        # row["identifier"] += " + " + combined_identifier

# Now, df_1961 should have updated "location" and "identifier" columns based on the conditions.


In [140]:
# Function to split string values containing '+' and convert to a unique string
def split_values(cell_value):
    unique_values = sorted(set(element.strip() for element in cell_value.split('+')))
    return '+'.join(unique_values)

# Apply the function to each cell in the specified columns
df_1956['location'] = df_1956['location'].apply(split_values)
df_1956['identifier'] = df_1956['identifier'].apply(split_values)

In [141]:
df_1956.drop(columns=['line_count', "stop_id_other"], inplace=True)

In [142]:
# Function to concatenate unique values in other columns
def concat_unique_values(series):
    unique_values = set(series)
    return '+ '.join(unique_values)

# Group by specified columns and aggregate the other columns
df_1956 = df_1956.groupby(['stop_name', 'type', 'in_lines', 'location']).agg({
    'identifier': concat_unique_values,
    'previous_in_lines': concat_unique_values,
    'stop_description': concat_unique_values
}).reset_index()

In [143]:
df_1956.to_csv('1956-stops-combined.csv')

In [144]:
df_1956.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   stop_name          1255 non-null   object
 1   type               1255 non-null   object
 2   in_lines           1255 non-null   object
 3   location           1255 non-null   object
 4   identifier         1255 non-null   object
 5   previous_in_lines  1255 non-null   object
 6   stop_description   1255 non-null   object
dtypes: object(7)
memory usage: 68.8+ KB


In [145]:
df_1956.head(10)

Unnamed: 0,stop_name,type,in_lines,location,identifier,previous_in_lines,stop_description
0,Adamstrasse Ecke Wilhelmstrasse,bus,['A13'],,,,
1,Adlershof,S-Bahn,"['KBS 103', 'KBS 100a']",,,,
2,Adolf-Scheidt-Platz,strassenbahn,['95'],"52.478194856980814, 13.37836902238447",,95+ 96+ 96,Adolf-Scheidt-Platz+ Adolf-Scheidt-Platz+ Adol...
3,Afrikanische Str.,u-bahn,['C I'],"52.560027777778,13.334633333333",Q559239,"C I,C II+ C I+ C I",Afrikanische Straße+ Afrikanische Straße+ Afri...
4,"Ahrensfeld, Lichtenbergerstrasse",bus,['A46'],,,,
5,Alboinstrasse Ecke Schönebergerstrasse,strassenbahn,['6'],,,,
6,Albrechtshof,S-Bahn,['KBS 103'],,,,
7,Alexanderplatz,S-Bahn,"['KBS 102', 'KBS 103', 'KBS 101']",,,,
8,Alexanderplatz,u-bahn,"['A I', 'D', 'E']","52.5214,13.4119",Q17173458,"E,D,A I,A II+ A,E+ A,E",Alexanderplatz u-bahn station+ Alexanderplatz ...
9,"Alexanderplatz, Grunerstrasse",strassenbahn,"['71', '74', '72', '22']","52.52008326268696, 13.41408418961877",,"82,74,74E,72,71+ 71+ 71","Alexanderplatz, Grunerstrasse+ Alexanderplatz,..."
