In [1]:
import pandas as pd
import ast

In [4]:
df_stops_total = pd.read_csv('../final-tables/stops_table.csv')
df_1967 = pd.read_csv('stops_df_1967-initial.csv')

In [5]:
df_1967.head(3)

Unnamed: 0.1,Unnamed: 0,stop_name,line_count,type,in_lines
0,0,Revalerstrasse,2,strassenbahn,"['4', '3']"
1,1,Warschauer Strasse,3,strassenbahn,"['82', '3']"
2,2,Warschauer Strasse,3,bus,['O30']


In [6]:
df_1967.drop(columns=['Unnamed: 0'], inplace=True)
df_1967["location"] = ""
df_1967["identifier"] = ""
df_1967["stop_id_other"] = ""
df_1967["previous_in_lines"] = ""
df_1967["stop_description"] = ""

In [7]:
df_stops_total.tail(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
6423,19641453,Zum Heckeshorn,bus,"52.43307505069834, 13.164008736441371",A51,,Zum Heckeshorn Ecke Am Großen Wannsee
6424,19641454,Zweibrücker Strasse,bus,"52.54677409100436, 13.188764018875105",A63,,
6425,19641455,Zühlsdorf,bus,"52.7162250652868, 13.366925671584623",AS,,


In [8]:
# Function to convert string representation of set to list
def convert_to_list(value):
    try:
        # Use ast.literal_eval to safely convert string to set
        items = ast.literal_eval(value)
        
        # Check if the result is a set, and convert it to a list
        if isinstance(items, set):
            return list(items)
        # Check if the result is an integer, and convert it to a list with a single element
        elif isinstance(items, int):
            return [items]
        # For other types, return a list with the original value
        else:
            return [value]
    except (SyntaxError, ValueError):
        # Handle the case where literal_eval fails (e.g., invalid string)
        # Try to convert the value to an integer, and return a list with a single element if successful
        try:
            return [int(value)]
        except ValueError:
            return [value]

# Apply the function to the 'in_lines' column
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(convert_to_list)
df_1967['in_lines'] = df_1967['in_lines'].apply(convert_to_list)

In [9]:
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(lambda x: ','.join(map(str, x)))
df_1967['in_lines'] = df_1967['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [10]:
df_stops_total.head(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
0,19510,Adamstrasse Ecke Wilhelmstrasse,bus,"52.5210974258045, 13.187990320264987",['A13'],,
1,19511,Adlershof,s-bahn,"52.434722222222,13.541388888889","['KBS 100a', 'KBS 103']",Q323551,
2,19512,"Adlershof, Oppenbrücke (Benzolwerk)",strassenbahn,"52.42607491619971, 13.555219462749426",['84'],,


I have two dataframes df_long and df_short, I want to add the information from df_long columns combined_location and combined_identifier to df_short if there is a match between the original_name column from df_long with the stop_name column of df_short. I have to create combined_location and combined_identifier columns in df_short first

In [11]:
for index, row in df_1967.iterrows():
    # Check if row["stop_name"] is in df_1960["original_name"]
    if row["stop_name"] in df_stops_total["stop_name"].values:
        # Find matching rows in df_1960
        matching_rows = df_stops_total[df_stops_total["stop_name"] == row["stop_name"]]
        
        # Check if those two rows have matching row["type"] values
        matching_rows = matching_rows[matching_rows["type"] == row["type"]]
        
        # Check if there are any matching rows
        if not matching_rows.empty:
            df_1967.at[index, 'location'] = '+ '.join(matching_rows['location'].astype(str).values)
            df_1967.at[index, 'identifier'] = '+ '.join(matching_rows['identifier'].astype(str).values)
            df_1967.at[index, 'stop_description'] = '+ '.join(matching_rows['stop_description'].astype(str).values)
            df_1967.at[index, 'stop_id_other'] = '+ '.join(matching_rows['stop_id'].astype(str).values)
            df_1967.at[index, 'previous_in_lines'] = '+ '.join(matching_rows['in_lines'].astype(str).values)
        else:
            # Handle the case where there are no matching rows in df_1960
            df_1967.at[index, 'location'] = ''
            df_1967.at[index, 'identifier'] = ''
            df_1967.at[index, 'stop_description'] = ''
            df_1967.at[index, 'stop_id_other'] = ''
            df_1967.at[index, 'previous_in_lines'] = ''




        # # Convert values to strings and replace NaN with empty strings
        # combined_location = " + ".join(matching_rows["combined_location"].astype(str).fillna(''))
        # combined_identifier = " + ".join(matching_rows["combined_identifier"].astype(str).fillna(''))
            
        # # Update row["location"] and row["identifier"]
        # row["location"] += " + " + combined_location
        # row["identifier"] += " + " + combined_identifier

# Now, df_1961 should have updated "location" and "identifier" columns based on the conditions.


In [12]:
# Function to split string values containing '+' and convert to a unique string
def split_values(cell_value):
    unique_values = sorted(set(element.strip() for element in cell_value.split('+')))
    return '+'.join(unique_values)

# Apply the function to each cell in the specified columns
df_1967['location'] = df_1967['location'].apply(split_values)
df_1967['identifier'] = df_1967['identifier'].apply(split_values)

In [13]:
df_1967.drop(columns=['line_count', "stop_id_other"], inplace=True)

In [14]:
# Function to concatenate unique values in other columns
def concat_unique_values(series):
    unique_values = set(series)
    return '+ '.join(unique_values)

# Group by specified columns and aggregate the other columns
df_1967 = df_1967.groupby(['stop_name', 'type', 'in_lines', 'location']).agg({
    'identifier': concat_unique_values,
    'previous_in_lines': concat_unique_values,
    'stop_description': concat_unique_values
}).reset_index()

In [15]:
df_1967.to_csv('1967-stops-combined.csv')

In [16]:
df_1967.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1375 entries, 0 to 1374
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   stop_name          1375 non-null   object
 1   type               1375 non-null   object
 2   in_lines           1375 non-null   object
 3   location           1375 non-null   object
 4   identifier         1375 non-null   object
 5   previous_in_lines  1375 non-null   object
 6   stop_description   1375 non-null   object
dtypes: object(7)
memory usage: 75.3+ KB


In [17]:
df_1967.head(10)

Unnamed: 0,stop_name,type,in_lines,location,identifier,previous_in_lines,stop_description
0,4. Ring,bus,['A53'],"52.42250630689789, 13.2869490566549",,A53+ A53+ A53,4. Ring+ 4. Ring+ 4. Ring
1,Aalemannufer Ecke Nieder-Neuendorfer Allee,bus,['A97'],,,,
2,Adalbertstrasse,bus,['A18'],"52.50548699996121, 13.42221404498021",,A1P,
3,Adamstrasse Ecke Pichelsdorferstrasse,bus,"['A34', 'A97']",,,,
4,Adamstrasse Ecke Wilhelmstrasse,bus,['A35'],"52.5210974258045, 13.187990320264987",,['A13']+ ['A13'],nan+ nan
5,Adenauerdamm (Messedamm),u-bahn,['1'],,,,
6,Adlershof,s-bahn,"['KBS 103', 'KBS 103a', 'KBS 106a', 'KBS 105']","52.434722222222,13.541388888889",Q323551,"['KBS 100a', 'KBS 103']+ ['KBS 100a', 'KBS 103...",nan+ nan+ Berlin-Adlershof station+ Berlin-Adl...
7,Afrikanische Str. (Friedrich-Ebert-Siedlung),u-bahn,['6'],,,,
8,Afrikanischestrasse Ecke Seestrasse,bus,['A89'],,,,
9,"Ahrensfeld, Kirche",bus,['A46'],"52.57532170199278, 13.576515606968998",,A46+ A46+ A46,"Ahrensfeld, Kirche+ Ahrensfeld, Kirche+ Ahrens..."
