In [7]:
import pandas as pd
import ast

In [8]:
df_stops_total = pd.read_csv('../final-tables/stops_table.csv')
df_1971 = pd.read_csv('stops_df_1971-initial.csv')

In [9]:
df_1971.head(3)

Unnamed: 0.1,Unnamed: 0,stop_name,line_count,type,in_lines
0,0,"Glienicke, Kirche",1,bus,['7']
1,1,Elsässer Strasse,1,bus,['7']
2,2,"Schildow, Kirche",1,bus,['7']


In [10]:
df_1971.drop(columns=['Unnamed: 0'], inplace=True)
df_1971["location"] = ""
df_1971["identifier"] = ""
df_1971["stop_id_other"] = ""
df_1971["previous_in_lines"] = ""
df_1971["stop_description"] = ""

In [11]:
df_stops_total.tail(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
10045,19761613,Zweiwinkelweg,bus,"52.532774381217585, 13.148393051743144",['80'],,
10046,19761614,Zwickauer Damm,bus,"52.42412841468055, 13.48547693484881",['41'],,
10047,19761615,zum Dorf Gatow,bus,"52.48199565682041, 13.166766484288239",['34E'],,


In [12]:
# Function to convert string representation of set to list
def convert_to_list(value):
    try:
        # Use ast.literal_eval to safely convert string to set
        items = ast.literal_eval(value)
        
        # Check if the result is a set, and convert it to a list
        if isinstance(items, set):
            return list(items)
        # Check if the result is an integer, and convert it to a list with a single element
        elif isinstance(items, int):
            return [items]
        # For other types, return a list with the original value
        else:
            return [value]
    except (SyntaxError, ValueError):
        # Handle the case where literal_eval fails (e.g., invalid string)
        # Try to convert the value to an integer, and return a list with a single element if successful
        try:
            return [int(value)]
        except ValueError:
            return [value]

# Apply the function to the 'in_lines' column
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(convert_to_list)
df_1971['in_lines'] = df_1971['in_lines'].apply(convert_to_list)

In [13]:
df_stops_total['in_lines'] = df_stops_total['in_lines'].apply(lambda x: ','.join(map(str, x)))
df_1971['in_lines'] = df_1971['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [14]:
df_stops_total.head(3)

Unnamed: 0,stop_id,stop_name,type,location,in_lines,identifier,stop_description
0,19460,Adlershof,s-bahn,"52.434722222222,13.541388888889","['KBS 100a', 'KBS 100c']",Q323551,
1,19461,Akazienallee,bus,"52.47849809201335, 13.607179083668733",['D'],,
2,19462,Alexanderplatz,bus,"52.52005246558073, 13.413957243449447","['1', '9']",,


I have two dataframes df_long and df_short, I want to add the information from df_long columns combined_location and combined_identifier to df_short if there is a match between the original_name column from df_long with the stop_name column of df_short. I have to create combined_location and combined_identifier columns in df_short first

In [15]:
for index, row in df_1971.iterrows():
    # Check if row["stop_name"] is in df_1960["original_name"]
    if row["stop_name"] in df_stops_total["stop_name"].values:
        # Find matching rows in df_1960
        matching_rows = df_stops_total[df_stops_total["stop_name"] == row["stop_name"]]
        
        # Check if those two rows have matching row["type"] values
        matching_rows = matching_rows[matching_rows["type"] == row["type"]]
        
        # Check if there are any matching rows
        if not matching_rows.empty:
            df_1971.at[index, 'location'] = '+ '.join(matching_rows['location'].astype(str).values)
            df_1971.at[index, 'identifier'] = '+ '.join(matching_rows['identifier'].astype(str).values)
            df_1971.at[index, 'stop_description'] = '+ '.join(matching_rows['stop_description'].astype(str).values)
            df_1971.at[index, 'stop_id_other'] = '+ '.join(matching_rows['stop_id'].astype(str).values)
            df_1971.at[index, 'previous_in_lines'] = '+ '.join(matching_rows['in_lines'].astype(str).values)
        else:
            # Handle the case where there are no matching rows in df_1960
            df_1971.at[index, 'location'] = ''
            df_1971.at[index, 'identifier'] = ''
            df_1971.at[index, 'stop_description'] = ''
            df_1971.at[index, 'stop_id_other'] = ''
            df_1971.at[index, 'previous_in_lines'] = ''




        # # Convert values to strings and replace NaN with empty strings
        # combined_location = " + ".join(matching_rows["combined_location"].astype(str).fillna(''))
        # combined_identifier = " + ".join(matching_rows["combined_identifier"].astype(str).fillna(''))
            
        # # Update row["location"] and row["identifier"]
        # row["location"] += " + " + combined_location
        # row["identifier"] += " + " + combined_identifier

# Now, df_1961 should have updated "location" and "identifier" columns based on the conditions.


In [16]:
# Function to split string values containing '+' and convert to a unique string
def split_values(cell_value):
    unique_values = sorted(set(element.strip() for element in cell_value.split('+')))
    return '+'.join(unique_values)

# Apply the function to each cell in the specified columns
df_1971['location'] = df_1971['location'].apply(split_values)
df_1971['identifier'] = df_1971['identifier'].apply(split_values)

In [17]:
df_1971.drop(columns=['line_count', "stop_id_other"], inplace=True)

In [18]:
# Function to concatenate unique values in other columns
def concat_unique_values(series):
    unique_values = set(series)
    return '+ '.join(unique_values)

# Group by specified columns and aggregate the other columns
df_1971 = df_1971.groupby(['stop_name', 'type', 'in_lines', 'location']).agg({
    'identifier': concat_unique_values,
    'previous_in_lines': concat_unique_values,
    'stop_description': concat_unique_values
}).reset_index()

In [19]:
df_1971.to_csv('1971-stops-combined.csv')

In [20]:
df_1971.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1283 entries, 0 to 1282
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   stop_name          1283 non-null   object
 1   type               1283 non-null   object
 2   in_lines           1283 non-null   object
 3   location           1283 non-null   object
 4   identifier         1283 non-null   object
 5   previous_in_lines  1283 non-null   object
 6   stop_description   1283 non-null   object
dtypes: object(7)
memory usage: 70.3+ KB


In [21]:
df_1971.head(10)

Unnamed: 0,stop_name,type,in_lines,location,identifier,previous_in_lines,stop_description
0,4. Ring,bus,['A53'],"52.42250630689789, 13.2869490566549",,A53+ A53+ A53+ ['A53'],4. Ring+ 4. Ring+ 4. Ring+ nan
1,Aalemannufer,bus,['A97'],,,,
2,Adalbertstrasse,bus,"['18', '16']","52.50548699996121, 13.42221404498021",,"A1P+ ['A18']+ ['75', '78']",nan+ nan+ nan
3,Adamstrasse,bus,['A34'],"52.521029117218866, 13.19780699419913",,"A35,A34+ A34,A35+ A34,A35+ ['35', '97', '34']",Adamstrasse Ecke Pichelsdorfer Strasse+ Adamst...
4,Adamstrasse Ecke Földerichstrasse,bus,['A35'],,,,
5,Adlershof,s-bahn,"['KBS 140', 'KBS 110', 'KBS 141', 'KBS 115']","52.434722222222,13.541388888889",Q323551,"['KBS 100a', 'KBS 100c']+ ['KBS 100a', 'KBS 10...",nan+ nan+ nan+ Berlin-Adlershof station+ Berli...
6,Afrikanische Str. (Friedrich-Ebert-Siedlung),u-bahn,['6'],"52.560027777778,13.334633333333",Q559239,['6'],
7,Afrikanischestrasse Ecke Seestrasse,bus,['A89'],"52.54623694310499, 13.343930832086308",,['A89'],
8,"Ahrensfeld, Kirche",bus,['L/41'],"52.57532170199278, 13.576515606968998",,A46+ A46+ A46+ ['A46'],"Ahrensfeld, Kirche+ Ahrensfeld, Kirche+ Ahrens..."
9,Akademie der Wissenschaften,bus,['51/B'],"52.6361286884856, 13.501898618531815",,['51B']+ ['51'],nan+ nan
