In [1]:
import pandas as pd

# Synthetic cast table that mirrors the structure of the provided sample.
# Each tuple below is one (movie_id, cast_id, cast_name) credit; the records
# are transposed into the column-oriented dict handed to pandas.
data = {
    column: list(values)
    for column, values in zip(
        ('movie_id', 'cast_id', 'cast_name'),
        zip(*[
            (9, 162652153, 'Hayden Christensen'),
            (9, 162652152, 'Ewan McGregor'),
            (9, 418638213, 'Kenny Baker'),
            (9, 548155708, 'Graeme Blundell'),
            (9, 358317901, 'Jeremy Bulloch'),
            (10, 162652153, 'Hayden Christensen'),
            (10, 162652152, 'Ewan McGregor'),
            (10, 418638213, 'Kenny Baker'),
            (11, 162652153, 'Hayden Christensen'),
            (11, 162652152, 'Ewan McGregor'),
            (11, 548155708, 'Graeme Blundell'),
            (12, 162652152, 'Ewan McGregor'),
            (12, 418638213, 'Kenny Baker'),
            (13, 162652152, 'Ewan McGregor'),
            (13, 548155708, 'Graeme Blundell'),
            (14, 162652153, 'Hayden Christensen'),
            (14, 418638213, 'Kenny Baker'),
            (15, 162652153, 'Hayden Christensen'),
            (15, 162652152, 'Ewan McGregor'),
            (15, 418638213, 'Kenny Baker'),
        ]),
    )
}
cast_df = pd.DataFrame(data)

# Show the first ten rows as a quick sanity check of the frame
cast_df.head(10)

Unnamed: 0,movie_id,cast_id,cast_name
0,9,162652153,Hayden Christensen
1,9,162652152,Ewan McGregor
2,9,418638213,Kenny Baker
3,9,548155708,Graeme Blundell
4,9,358317901,Jeremy Bulloch
5,10,162652153,Hayden Christensen
6,10,162652152,Ewan McGregor
7,10,418638213,Kenny Baker
8,11,162652153,Hayden Christensen
9,11,162652152,Ewan McGregor


In [5]:
# Sample frame for the pairing demo. No earlier cell defines `df`, so it is
# built here to keep this cell runnable on a fresh kernel (Restart & Run All).
# The rows are reconstructed from this cell's recorded output.
df = pd.DataFrame({
    'col1': ['A', 'B', 'C', 'D', 'E', 'A', 'D'],
    'col2': [1, 1, 2, 2, 2, 3, 3],
})

# For every 'col2' key, collect the 'col1' values that share it into a list
df_pairs = df.groupby('col2')['col1'].agg(list).reset_index()

# Attach each row's group list back onto the row. Both frames carry 'col1',
# so the aggregated column is disambiguated with the '_pairs' suffix.
df_merged = df.merge(df_pairs, on='col2', suffixes=('', '_pairs'))

print(df_merged)

  col1  col2 col1_pairs
0    A     1     [A, B]
1    B     1     [A, B]
2    C     2  [C, D, E]
3    D     2  [C, D, E]
4    E     2  [C, D, E]
5    A     3     [A, D]
6    D     3     [A, D]


In [2]:
from itertools import combinations
from collections import Counter

# Minimum number of shared movies for a pair to qualify as good teamwork
MIN_SHARED_MOVIES = 3

# Every unordered pair of distinct cast members, once per movie they share.
# Names are de-duplicated within a movie so a repeated credit cannot create a
# self-pair or count the same movie twice. Sorting makes (A, B) and (B, A)
# the same tuple. A movie with fewer than two cast members contributes no
# pairs at all, rather than a NaN placeholder.
cast_pairs = [
    pair
    for _movie_id, names in cast_df.groupby('movie_id')['cast_name']
    for pair in combinations(sorted(names.dropna().unique()), 2)
]

# Number of movies each pair has appeared in together
pair_counts = Counter(cast_pairs)

# Keep only the pairs that reach the threshold
good_pairs = {
    pair: count
    for pair, count in pair_counts.items()
    if count >= MIN_SHARED_MOVIES
}

# Build the result directly from flat (member_1, member_2, count) records.
# Passing explicit column names keeps this valid when no pair qualifies;
# the result is then an empty frame with the same three columns.
good_teamwork = (
    pd.DataFrame(
        [(first, second, count) for (first, second), count in good_pairs.items()],
        columns=['cast_member_1', 'cast_member_2', 'num_movies'],
    )
    .astype({'num_movies': 'int64'})  # stable dtype even for the empty result
    .sort_values(by=['cast_member_1', 'cast_member_2'])
    .reset_index(drop=True)
)

good_teamwork


Unnamed: 0,cast_member_1,cast_member_2,num_movies
0,Ewan McGregor,Graeme Blundell,3
1,Ewan McGregor,Hayden Christensen,4
2,Ewan McGregor,Kenny Baker,4
3,Hayden Christensen,Kenny Baker,4
