In [2]:
import pandas as pd
from collections import Counter
import json
import os

In [3]:
# Assign path variable
path = '/Users/mariamaske/Star Trek Analysis/Data/Cleaned Data'

## TAS

In [4]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'TAS'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

    Series               Dialogue1 Dialogue2  Count
109    TAS                    KIRK     SPOCK     13
156    TAS                   SPOCK      SULU      7
162    TAS                    SULU     UHURA      7
138    TAS                   MCCOY     SPOCK      6
106    TAS                    KIRK     MCCOY      4
..     ...                     ...       ...    ...
155    TAS                    SORD     SPOCK      1
157    TAS                   SPOCK     TCHAR      1
158    TAS                   SPOCK    THELIN      1
160    TAS  SPOCK: Ready, Captain.    WESLEY      1
161    TAS                  STAVOS     UHURA      1

[163 rows x 4 columns]


In [5]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['KIRK', 'SPOCK', 'MCCOY', 'CHEKOV', 'SCOTT', 'SULU', 'UHURA', 'CHAPEL']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['KIRK', 'SPOCK', 'MCCOY', 'CHEKOV', 'SCOTT', 'SULU', 'UHURA', 'CHAPEL']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 1)
]

In [6]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
109,TAS,KIRK,SPOCK,13
156,TAS,SPOCK,SULU,7
162,TAS,SULU,UHURA,7
138,TAS,MCCOY,SPOCK,6
106,TAS,KIRK,MCCOY,4
108,TAS,KIRK,SCOTT,4
137,TAS,MCCOY,SCOTT,4
150,TAS,SCOTT,SULU,4
152,TAS,SCOTT,UHURA,4
159,TAS,SPOCK,UHURA,4


In [7]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_tas.csv'))

In [8]:
import pandas as pd

df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0      KIRK         23.0
1     SPOCK         33.0
2      SULU         20.0
3     MCCOY         20.0
4     SCOTT         22.0
5    CHAPEL          4.0
6     UHURA         20.0


In [9]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_tas.csv'))

## TNG

In [10]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'TNG'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

     Series                              Dialogue1  \
1376    TNG                                 PICARD   
1472    TNG                                  RIKER   
683     TNG                                   DATA   
677     TNG                                   DATA   
658     TNG                                   DATA   
...     ...                                    ...   
1583    TNG                                 WESLEY   
1584    TNG  WESLEY: No! He's very weak. TRAVELLER   
1585    TNG                                  WOMAN   
1587    TNG                                   WORF   
1588    TNG                              YOUNG MAN   

                                              Dialogue2  Count  
1376                                              RIKER     57  
1472                                               WORF     52  
683                                               RIKER     48  
677                                              PICARD     47  
658                       

In [11]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['PICARD', 'RIKER', 'DATA', 'WORF', 'TROI', 'CRUSHER', 'LAFORGE', 'WESLEY']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['PICARD', 'RIKER', 'DATA', 'WORF', 'TROI', 'CRUSHER', 'LAFORGE', 'WESLEY']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 1)
]

In [12]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
1376,TNG,PICARD,RIKER,57
1472,TNG,RIKER,WORF,52
683,TNG,DATA,RIKER,48
677,TNG,DATA,PICARD,47
658,TNG,DATA,LAFORGE,42
1395,TNG,PICARD,WORF,34
699,TNG,DATA,WORF,30
1121,TNG,LAFORGE,RIKER,30
607,TNG,CRUSHER,TROI,27
1117,TNG,LAFORGE,PICARD,27


In [13]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_tng.csv'))

In [14]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0    PICARD        214.0
1     RIKER        236.0
2      DATA        227.0
3   LAFORGE        167.0
4   CRUSHER        123.0
5      TROI        146.0
6    WESLEY         79.0
7      WORF        194.0


In [15]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_tng.csv'))

## DS9

In [16]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'DS9'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

     Series Dialogue1      Dialogue2  Count
1123    DS9      KIRA          SISKO     50
204     DS9    BASHIR        O'BRIEN     45
615     DS9       DAX           KIRA     45
1399    DS9       ODO          QUARK     41
1109    DS9      KIRA        O'BRIEN     33
...     ...       ...            ...    ...
1605    DS9    WEYOUN           WORF      1
1606    DS9   WHATLEY           WORF      1
1607    DS9    WILLIE          WOMAN      1
1608    DS9     WOMAN        WOMAN 2      1
1609    DS9      WORF  WORF + HURAGA      1

[1611 rows x 4 columns]


In [17]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['SISKO', 'ODO', 'DAX', 'KIRA', "O'BRIEN", 'QUARK', 'BASHIR', 'JAKE', 'WORF', 'EZRI']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['SISKO', 'ODO', 'DAX', 'KIRA', "O'BRIEN", 'QUARK', 'BASHIR', 'JAKE', 'WORF', 'EZRI']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 5)
]

In [18]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
1123,DS9,KIRA,SISKO,50
204,DS9,BASHIR,O'BRIEN,45
615,DS9,DAX,KIRA,45
1399,DS9,ODO,QUARK,41
1109,DS9,KIRA,O'BRIEN,33
645,DS9,DAX,SISKO,31
205,DS9,BASHIR,ODO,30
630,DS9,DAX,O'BRIEN,29
1371,DS9,O'BRIEN,SISKO,29
156,DS9,BASHIR,DAX,28


In [19]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_ds9.csv'))

In [20]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0      KIRA        224.0
1    BASHIR        190.0
2       DAX        195.0
3       ODO        177.0
4   O'BRIEN        197.0
5      JAKE         61.0
6     SISKO        188.0
7     QUARK        139.0
8      EZRI         10.0
9      WORF        111.0


In [21]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_ds9.csv'))

## ENT

In [22]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'ENT'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

    Series Dialogue1                                          Dialogue2  Count
108    ENT    ARCHER                                              T'POL     40
498    ENT     HOSHI                                               REED     32
113    ENT    ARCHER                                             TUCKER     31
766    ENT      REED                                             TRAVIS     31
831    ENT     T'POL                                             TUCKER     25
..     ...       ...                                                ...    ...
864    ENT   VALDORE                                               VRAX      1
866    ENT    VOICES                                          YOUNG MAN      1
867    ENT      VOSK  VOSK: You are a fool. You think we're equals b...      1
868    ENT     YOLEN                                              ZEPHT      1
869    ENT  ZHO'KAAN                                             ZSHAAR      1

[870 rows x 4 columns]


In [23]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['ARCHER', "T'POL", 'PHLOX', 'REES', "TRAVIS", 'HOSHI', 'TUCKER']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['ARCHER', "T'POL", 'PHLOX', 'REES', "TRAVIS", 'HOSHI', 'TUCKER']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 5)
]

In [24]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
108,ENT,ARCHER,T'POL,40
113,ENT,ARCHER,TUCKER,31
831,ENT,T'POL,TUCKER,25
513,ENT,HOSHI,TRAVIS,21
721,ENT,PHLOX,TRAVIS,19
714,ENT,PHLOX,T'POL,16
509,ENT,HOSHI,T'POL,15
516,ENT,HOSHI,TUCKER,15
723,ENT,PHLOX,TUCKER,15
850,ENT,TRAVIS,TUCKER,15


In [25]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_ent.csv'))

In [26]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0    ARCHER         97.0
1     T'POL        110.0
2     HOSHI         79.0
3     PHLOX         71.0
4    TRAVIS         74.0
5    TUCKER        101.0


In [27]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_ent.csv'))

## DIS

In [28]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'DIS'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

    Series          Dialogue1      Dialogue2  Count
360    DIS             DETMER       OWOSEKUN     13
152    DIS              BRYCE       OWOSEKUN      9
144    DIS              BRYCE         DETMER      8
710    DIS            STAMETS          TILLY      8
201    DIS            BURNHAM           SARU      7
..     ...                ...            ...    ...
726    DIS  TRANSPORTER CHIEF          TYLER      1
727    DIS  TRANSPORTER CHIEF          WOMAN      1
728    DIS              TYLER      VOQ-TYLER      1
729    DIS              VANCE          WILLA      1
730    DIS                VOQ  YOUNG T'KUVMA      1

[731 rows x 4 columns]


In [29]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['BURNHAM', "SARU", 'TILLY', 'LORCA', "CULBER", 'PIKE', 'NHAN', 'STAMETS', 'TYLER']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['BURNHAM', "SARU", 'TILLY', 'LORCA', "CULBER", 'PIKE', 'NHAN', 'STAMETS', 'TYLER']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 2)
]

In [30]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
710,DIS,STAMETS,TILLY,8
201,DIS,BURNHAM,SARU,7
206,DIS,BURNHAM,TILLY,7
688,DIS,SARU,TILLY,7
685,DIS,SARU,STAMETS,5
197,DIS,BURNHAM,PIKE,4
337,DIS,CULBER,STAMETS,4
690,DIS,SARU,TYLER,4
203,DIS,BURNHAM,STAMETS,3
334,DIS,CULBER,SARU,3


In [31]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_dis.csv'))

In [32]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0   STAMETS         20.0
1   BURNHAM         25.0
2      SARU         31.0
3    CULBER         14.0
4      PIKE         12.0
5     LORCA          4.0
6     TILLY         31.0
7     TYLER         13.0


In [33]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_dis.csv'))

## PIC

In [34]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'PIC'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

    Series Dialogue1     Dialogue2  Count
139    PIC     RAFFI          RIOS      4
11     PIC     AGNES            OH      3
63     PIC     ELNOR        PICARD      3
123    PIC     NAREK          SOJI      3
134    PIC    PICARD          RIOS      3
..     ...       ...           ...    ...
159    PIC      SOHI            XB      1
160    PIC      SOJI  SOJI + RAFFI      1
161    PIC      SOJI        TENQEM      1
162    PIC     SOONG         SUTRA      1
163    PIC    SYNTHS         WOMAN      1

[164 rows x 4 columns]


In [35]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ['PICARD', "AGNES", 'SOJI', 'ELNOR', "RIOS", 'RAFFI']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ['PICARD', "AGNES", 'SOJI', 'ELNOR', "RIOS", 'RAFFI']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 1)
]

In [36]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
139,PIC,RAFFI,RIOS,4
63,PIC,ELNOR,PICARD,3
134,PIC,PICARD,RIOS,3
136,PIC,PICARD,SOJI,3
133,PIC,PICARD,RAFFI,2
141,PIC,RAFFI,SOJI,2
12,PIC,AGNES,PICARD,1
13,PIC,AGNES,RAFFI,1
14,PIC,AGNES,RIOS,1
64,PIC,ELNOR,RIOS,1


In [37]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_pic.csv'))

In [38]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0     RAFFI          9.0
1     ELNOR          4.0
2    PICARD         12.0
3     AGNES          3.0
4      RIOS         10.0
5      SOJI          6.0


In [39]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_pic.csv'))

## VOY

In [40]:
def extract_and_filter_interactions(json_file_path, specific_series, lines_threshold=2):
    # Read data from the JSON file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
    # Extract interactions with series and episode information
    interactions = []
    for series_key, series_data in json_data.items():
        if series_key == specific_series:
            for character, dialogues in series_data.items():
                # Ensure there are at least two dialogues for each character
                if len(dialogues) >= lines_threshold:
                    # Convert dialogues to strings
                    dialogues = [str(dialogue) for dialogue in dialogues]

                    # Create pairs of consecutive dialogues with series information
                    for i in range(len(dialogues) - 1):
                        # Sort the dialogues to ensure consistent order
                        sorted_dialogues = sorted([dialogues[i], dialogues[i + 1]])

                        interaction = (series_key, sorted_dialogues[0], sorted_dialogues[1])
                        interactions.append(interaction)

    # Count occurrences of interactions
    interaction_counts = Counter(interactions)

    # Convert the result to a DataFrame
    df = pd.DataFrame(list(interaction_counts.items()), columns=['Interaction', 'Count'])

    # Split the 'Interaction' column into 'Series', 'Dialogue1', and 'Dialogue2'
    df[['Series', 'Dialogue1', 'Dialogue2']] = pd.DataFrame(df['Interaction'].tolist(), index=df.index)

    # Drop the original 'Interaction' column
    df.drop(columns=['Interaction'], inplace=True)

    # Reorder columns
    df = df[['Series', 'Dialogue1', 'Dialogue2', 'Count']]

    # Group by the combined dialogues and sum the counts
    df_combined = df.groupby(['Series', 'Dialogue1', 'Dialogue2']).agg({'Count': 'sum'}).reset_index()

    # Sort by series, interaction, and count in descending order
    df_combined.sort_values(by=['Series', 'Count'], ascending=[True, False], inplace=True)

    # Filter the DataFrame for a specific series
    filtered_df = df_combined.loc[df_combined['Series'] == specific_series]

    return filtered_df

# Example usage with lines_threshold set to 3
json_file_path = '/Users/mariamaske/Star Trek Analysis/Data/StarTrekDialogue.json'
specific_series = 'VOY'
lines_threshold = 5
result_df = extract_and_filter_interactions(json_file_path, specific_series, lines_threshold)

# Print the final result
print(result_df)

     Series        Dialogue1      Dialogue2  Count
1103    VOY              KIM          PARIS     50
351     VOY         CHAKOTAY        JANEWAY     41
388     VOY         CHAKOTAY          TUVOK     40
966     VOY          JANEWAY            KIM     40
991     VOY          JANEWAY          TUVOK     38
...     ...              ...            ...    ...
1443    VOY  ZAHL\r OFFICIAL  ZAHL OFFICIAL      1
1444    VOY        pCHAKOTAY           pEMH      1
1445    VOY        pCHAKOTAY         pPARIS      1
1446    VOY          pNEELIX         pTUVOK      1
1447    VOY           pPARIS         pTUVOK      1

[1448 rows x 4 columns]


In [42]:
# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset
allowed_dialogue1_values = ["SEVEN", 'TORRES', 'JANEWAY', 'CHAKOTAY', 'NEELIX', 'PARIS', 'TUVOK', 'DOCTOR', 'KES', 'KIM']  # Replace with the specific values you're interested in for 'Dialogue1'
allowed_dialogue2_values = ["SEVEN", 'TORRES', 'JANEWAY', 'CHAKOTAY', 'NEELIX', 'PARIS', 'TUVOK', 'DOCTOR', 'KES', 'KIM']  # Replace with the specific values you're interested in for 'Dialogue2'

filtered_df = result_df[
    (result_df['Dialogue1'].isin(allowed_dialogue1_values)) & 
    (result_df['Dialogue2'].isin(allowed_dialogue2_values)) & 
    (result_df['Count'] >= 1)
]

In [43]:
filtered_df.head(50)

Unnamed: 0,Series,Dialogue1,Dialogue2,Count
1103,VOY,KIM,PARIS,50
351,VOY,CHAKOTAY,JANEWAY,41
388,VOY,CHAKOTAY,TUVOK,40
966,VOY,JANEWAY,KIM,40
991,VOY,JANEWAY,TUVOK,38
1119,VOY,KIM,TUVOK,37
1329,VOY,PARIS,TUVOK,35
385,VOY,CHAKOTAY,TORRES,32
371,VOY,CHAKOTAY,PARIS,30
981,VOY,JANEWAY,PARIS,30


In [44]:
# exporting Star trek final data
filtered_df.to_csv(os.path.join(path,'StarTrek_interaction_maincast_voy.csv'))

In [45]:
df = filtered_df

# Replace 'Dialogue1', 'Dialogue2', and 'Count' with the actual column names in your dataset

# Create a list of all characters from both 'Dialogue1' and 'Dialogue2'
all_characters = pd.concat([df['Dialogue1'], df['Dialogue2']]).unique()

# Create a DataFrame with all characters
all_characters_df = pd.DataFrame({'Character': all_characters})

# Merge the DataFrame with counts for 'Dialogue1'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue1')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue1').rename(columns={'Count': 'Count_Dialogue1'})

# Merge the DataFrame with counts for 'Dialogue2'
all_characters_df = pd.merge(all_characters_df, df.groupby('Dialogue2')['Count'].sum().reset_index(), how='left', left_on='Character', right_on='Dialogue2').rename(columns={'Count': 'Count_Dialogue2'})

# Fill NaN values with 0
all_characters_df['Count_Dialogue1'] = all_characters_df['Count_Dialogue1'].fillna(0)
all_characters_df['Count_Dialogue2'] = all_characters_df['Count_Dialogue2'].fillna(0)

# Calculate the total count
all_characters_df['Total_Count'] = all_characters_df['Count_Dialogue1'] + all_characters_df['Count_Dialogue2']

# Drop unnecessary columns
all_characters_df = all_characters_df[['Character', 'Total_Count']]

# Now, all_characters_df contains a list of characters and their total counts from both 'Dialogue1' and 'Dialogue2'
print(all_characters_df)

  Character  Total_Count
0       KIM        215.0
1  CHAKOTAY        220.0
2   JANEWAY        212.0
3     PARIS        208.0
4    TORRES        172.0
5    NEELIX        142.0
6     SEVEN        105.0
7       KES         49.0
8    DOCTOR          2.0
9     TUVOK        221.0


In [46]:
# exporting Star trek tos total count data
all_characters_df.to_csv(os.path.join(path,'StarTrek_total_count_interaction_maincast_voy.csv'))