<a href="https://colab.research.google.com/github/olopopolo/Dissertation_repo/blob/main/PhD_analysis_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Read the annotated corpus export.
file_path = '/mnt/data/ProgConstructions.xlsx'
data = pd.read_excel(file_path)

# Keep only rows whose constructional schema is not coded 'ambiguous'.
is_unambiguous = data['construction_clean'] != 'ambiguous'
filtered_data = data[is_unambiguous]


def _frequency_table(column):
    """Return a (value, 'frequency') count table for `column` of filtered_data."""
    table = filtered_data[column].value_counts().reset_index()
    table.columns = [column, 'frequency']
    return table


def _with_mean_value(freq_table, mean_group_size):
    """Copy `freq_table` and add 'mean_value' = relative share * mean_group_size."""
    table = freq_table.copy()
    share = table['frequency'] / table['frequency'].sum()
    table['mean_value'] = share * mean_group_size
    return table


# Raw frequencies per schema and per construction type (ambiguous rows excluded).
filtered_construction_clean_freq = _frequency_table('construction_clean')
filtered_construction_type_freq = _frequency_table('construction_type')

# Mean number of rows per category; each of these is a single scalar.
filtered_construction_clean_mean_table = filtered_data.groupby('construction_clean').size().mean()
filtered_construction_type_mean_table = filtered_data.groupby('construction_type').size().mean()

# Frequency tables extended with the 'mean_value' column.
construction_clean_mean_table = _with_mean_value(
    filtered_construction_clean_freq, filtered_construction_clean_mean_table
)
construction_type_mean_table = _with_mean_value(
    filtered_construction_type_freq, filtered_construction_type_mean_table
)

# Percentage of all (unambiguous) rows taken by each category.
total_clean = filtered_construction_clean_freq['frequency'].sum()
construction_clean_mean_table['percentage'] = (
    construction_clean_mean_table['frequency'] / total_clean
) * 100

total_type = filtered_construction_type_freq['frequency'].sum()
construction_type_mean_table['percentage'] = (
    construction_type_mean_table['frequency'] / total_type
) * 100

# Bar charts of percentages, each bar annotated with "<pct>%\n(<raw count>)".


def _plot_percentage_bars(categories, percentages, raw_counts, *, color, xlabel, title, label_offset):
    """Draw one annotated percentage bar chart and return the bar container."""
    plt.figure(figsize=(12, 6))
    drawn = plt.bar(categories, percentages, color=color)
    plt.xlabel(xlabel)
    plt.ylabel('Percentage')
    plt.title(title)
    plt.xticks(rotation=90)
    # Bars and raw counts are in the same row order, so they can be paired directly.
    for rect, raw_freq in zip(drawn, raw_counts):
        height = rect.get_height()
        plt.text(
            rect.get_x() + rect.get_width()/2,
            height + label_offset,
            f'{height:.2f}%\n({raw_freq})',
            ha='center',
            va='bottom',
        )
    plt.show()
    return drawn


# Constructional schemas ('construction_clean').
bars = _plot_percentage_bars(
    filtered_construction_clean_freq['construction_clean'],
    construction_clean_mean_table['percentage'],
    filtered_construction_clean_freq['frequency'],
    color='skyblue',
    xlabel='Construction Clean',
    title='Percentages and Raw Frequencies of constructional schemas',
    label_offset=1,
)

# Construction types ('construction_type').
bars = _plot_percentage_bars(
    filtered_construction_type_freq['construction_type'],
    construction_type_mean_table['percentage'],
    filtered_construction_type_freq['frequency'],
    color='lightcoral',
    xlabel='Construction Type',
    title='Percentages and Raw Frequencies of construction types',
    label_offset=0.5,
)

# Co-occurrence counts of (schema, construction type); these become edge weights.
connections = (
    filtered_data
    .groupby(['construction_clean', 'construction_type'])
    .size()
    .reset_index(name='count')
)

# Directed graph: schema -> construction type, weighted by co-occurrence count.
G = nx.DiGraph()
for schema_name, type_name, pair_count in zip(
    connections['construction_clean'],
    connections['construction_type'],
    connections['count'],
):
    G.add_edge(schema_name, type_name, weight=pair_count)

# Frequency lookups used for node sizes and colours.
_schema_frequency = dict(zip(
    filtered_construction_clean_freq['construction_clean'],
    filtered_construction_clean_freq['frequency'],
))
_type_frequency = dict(zip(
    filtered_construction_type_freq['construction_type'],
    filtered_construction_type_freq['frequency'],
))

# A node is sized by its schema frequency if it is a schema, otherwise by its type frequency.
node_sizes = {
    node: (_schema_frequency[node] if node in _schema_frequency else _type_frequency[node]) * 10
    for node in G.nodes
}

# Schemas in blue, construction types in coral.
node_colors = [
    'skyblue' if node in _schema_frequency else 'lightcoral'
    for node in G.nodes
]

# Edge thickness is the weight scaled down by a factor of 0.05.
edge_weights = [weight for _, _, weight in G.edges(data='weight')]
scaled_edge_weights = [0.05 * weight for weight in edge_weights]
scaled_edge_weights = [weight * 0.05 for weight in edge_weights]

# Weighted spring layout, then draw.
plt.figure(figsize=(14, 10))
pos = nx.spring_layout(G, k=0.5, weight='weight')
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=[node_sizes[node] for node in G.nodes],
    node_color=node_colors,
    font_size=10,
    font_weight="bold",
    edge_color="grey",
    width=scaled_edge_weights,
)

plt.title('Network graph between Constructional Schemas and Construction Types')
plt.show()

# One network graph per constructional schema, linking it to its construction types.
construction_schemas = filtered_data['construction_clean'].unique()

for schema in construction_schemas:
    # Rows belonging to this schema only.
    schema_data = filtered_data[filtered_data['construction_clean'] == schema]

    # Edge list with co-occurrence counts.
    schema_connections = (
        schema_data
        .groupby(['construction_clean', 'construction_type'])
        .size()
        .reset_index(name='count')
    )

    G_schema = nx.DiGraph()
    for source, target, pair_count in zip(
        schema_connections['construction_clean'],
        schema_connections['construction_type'],
        schema_connections['count'],
    ):
        G_schema.add_edge(source, target, weight=pair_count)

    # The schema node uses the schema frequency; every other node uses its type frequency.
    schema_node_sizes = {}
    for node in G_schema.nodes:
        if node == schema:
            freq_table, key_column = filtered_construction_clean_freq, 'construction_clean'
        else:
            freq_table, key_column = filtered_construction_type_freq, 'construction_type'
        matching = freq_table.loc[freq_table[key_column] == node, 'frequency']
        schema_node_sizes[node] = matching.values[0] * 10

    schema_node_colors = [
        'lightcoral' if node != schema else 'skyblue'
        for node in G_schema.nodes
    ]

    # Edge thickness is the weight scaled by 0.05.
    schema_edge_weights = [weight for _, _, weight in G_schema.edges(data='weight')]
    scaled_schema_edge_weights = [weight * 0.05 for weight in schema_edge_weights]

    plt.figure(figsize=(14, 10))
    pos_schema = nx.spring_layout(G_schema, k=0.5, weight='weight')
    nx.draw(
        G_schema,
        pos_schema,
        with_labels=True,
        node_size=[schema_node_sizes[node] for node in G_schema.nodes],
        node_color=schema_node_colors,
        font_size=10,
        font_weight="bold",
        edge_color="grey",
        width=scaled_schema_edge_weights,
    )

    plt.title(f'Network graph for Constructional Schema: {schema}')
    plt.show()

#create a network graph to connect 'construction_clean' with 'meaning'
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Reload the dataset and drop rows whose schema is coded 'ambiguous'.
file_path = '/mnt/data/ProgConstructions.xlsx'
data = pd.read_excel(file_path)
filtered_data = data[data['construction_clean'] != 'ambiguous']

# Count of every (schema, meaning) combination.
filtered_construction_clean_meaning_freq = (
    filtered_data
    .groupby(['construction_clean', 'meaning'])
    .size()
    .reset_index(name='count')
)

# Directed graph with edges pointing meaning -> schema, weighted by the pair count.
G_meaning_reversed = nx.DiGraph()
for schema_name, meaning_name, pair_count in zip(
    filtered_construction_clean_meaning_freq['construction_clean'],
    filtered_construction_clean_meaning_freq['meaning'],
    filtered_construction_clean_meaning_freq['count'],
):
    G_meaning_reversed.add_edge(meaning_name, schema_name, weight=pair_count)

# Total count per meaning, summed over all schemas.
_meaning_totals = filtered_construction_clean_meaning_freq.groupby('meaning')['count'].sum()

# Meaning nodes are sized by their total count; schema nodes by the schema
# frequency table computed earlier in the notebook.
node_sizes_meaning_reversed = {}
for node in G_meaning_reversed.nodes:
    if node in _meaning_totals.index:
        node_sizes_meaning_reversed[node] = _meaning_totals[node] * 10
    else:
        schema_rows = filtered_construction_clean_freq['construction_clean'] == node
        node_sizes_meaning_reversed[node] = (
            filtered_construction_clean_freq.loc[schema_rows, 'frequency'].values[0] * 10
        )

# Meanings in green, schemas in blue.
node_colors_meaning_reversed = [
    'lightgreen' if node in _meaning_totals.index else 'skyblue'
    for node in G_meaning_reversed.nodes
]

# Edge thickness is the weight scaled by 0.05.
edge_weights_meaning_reversed = [
    weight for _, _, weight in G_meaning_reversed.edges(data='weight')
]
scaled_edge_weights_meaning_reversed = [
    weight * 0.05 for weight in edge_weights_meaning_reversed
]

# Larger k and more iterations spread the nodes further apart.
plt.figure(figsize=(14, 10))
pos_meaning_reversed = nx.spring_layout(G_meaning_reversed, k=1.5, iterations=100)
nx.draw(
    G_meaning_reversed,
    pos_meaning_reversed,
    with_labels=True,
    node_size=[node_sizes_meaning_reversed[node] for node in G_meaning_reversed.nodes],
    node_color=node_colors_meaning_reversed,
    font_size=10,
    font_weight="bold",
    edge_color="grey",
    width=scaled_edge_weights_meaning_reversed,
)

plt.title('Network graph of Meanings and Constructional Schemas')
plt.show()


In [None]:
# Fit a multinomial logistic regression (sklearn) on the scaled features.
log_reg_multinomial = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=10000,
)
log_reg_multinomial.fit(X_scaled, y)

# coef_ is transposed below so that rows are features and columns are classes.
coefficients = log_reg_multinomial.coef_
coefficients_df = pd.DataFrame(
    data=coefficients.T,
    index=feature_names,
    columns=label_encoder.classes_,
)

# Show the coefficient table.
tools.display_dataframe_to_user(
    name="Multinomial Logistic Regression Coefficients",
    dataframe=coefficients_df,
)

coefficients_df.head()

# Absolute coefficient magnitudes, used as a simple per-class importance measure.
coefficients_abs = coefficients_df.abs()

# Largest magnitude overall, so every panel shares the same x-axis scale.
max_coeff_value = coefficients_abs.values.max()

# Grid geometry is derived from the number of classes instead of a fixed 3x3,
# so the plots neither overflow (more than 9 classes) nor show blank panels (fewer).
_N_GRID_COLS = 3
_n_meanings = len(coefficients_abs.columns)
_n_grid_rows = max(1, -(-_n_meanings // _N_GRID_COLS))  # ceiling division


def _plot_feature_importance(annotate):
    """Draw one horizontal bar panel per class.

    annotate: when True, write each bar's value next to it.
    Returns the (figure, flattened axes array) pair.
    """
    figure, panel_axes = plt.subplots(
        nrows=_n_grid_rows,
        ncols=_N_GRID_COLS,
        figsize=(18, 6 * _n_grid_rows),  # 18x18 for the 3x3 case
        squeeze=False,
    )
    panel_axes = panel_axes.flatten()

    for ax, meaning in zip(panel_axes, coefficients_abs.columns):
        sorted_features = coefficients_abs[meaning].sort_values(ascending=False)
        bars = ax.barh(sorted_features.index, sorted_features.values, color='skyblue')
        ax.set_xlim(0, max_coeff_value)  # same x-axis scale on every panel
        ax.set_title(f'Feature Importance for {meaning}')
        ax.set_xlabel('Absolute Coefficient Value')
        ax.invert_yaxis()  # largest coefficient at the top

        if annotate:
            for bar in bars:
                width = bar.get_width()
                # Offset the label slightly to the right of the bar end.
                label_x_pos = width + max_coeff_value * 0.01
                ax.text(label_x_pos, bar.get_y() + bar.get_height() / 2, f'{width:.2f}', va='center')

    # Hide panels that have no class assigned to them.
    for unused_ax in panel_axes[_n_meanings:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    return figure, panel_axes


# First figure: plain bars. Second figure: bars with value labels.
fig, axes = _plot_feature_importance(annotate=False)
fig, axes = _plot_feature_importance(annotate=True)


In [None]:
from sklearn.decomposition import PCA

# One-hot encode the categorical columns so PCA can operate on them.
encoded_data = pd.get_dummies(mca_data)

# Project onto the first two principal components.
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(encoded_data)

# Attach the labels used for colouring and marker style in the plot.
_axis_names = ['Dimension 1', 'Dimension 2']
pca_transformed_df = pd.DataFrame(pca_transformed, columns=_axis_names)
for _label_column in ('meaning', 'construction_clean'):
    pca_transformed_df[_label_column] = filtered_data_clean[_label_column].values

# Scatter plot: hue = meaning, marker style = constructional schema.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    data=pca_transformed_df,
    x='Dimension 1',
    y='Dimension 2',
    hue='meaning',
    style='construction_clean',
    s=100,
)
plt.title('Principal Component Analysis (PCA) of Meanings and Constructions')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
plt.show()


In [None]:
#improve the network graph for 'meaning' and 'construction_clean'
import networkx as nx
import matplotlib.pyplot as plt

# Only the two columns that define the graph are needed.
data_filtered = data[['meaning', 'construction_clean']]

# Undirected graph with one node per meaning / construction and an edge for
# every pair that co-occurs in a row.
G = nx.Graph()
for meaning, construction in zip(data_filtered['meaning'], data_filtered['construction_clean']):
    G.add_node(meaning, label=meaning)
    G.add_node(construction, label=construction)
    G.add_edge(meaning, construction)

# First rendering: all nodes in a single colour.
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=3000,
    node_color="skyblue",
    font_size=10,
    font_weight="bold",
    edge_color="gray",
)
plt.title('Network Graph of Meanings and Constructions')
plt.show()

# Second rendering: meanings in blue, constructions in green.
_meaning_labels = data_filtered['meaning'].values
color_map = [
    'skyblue' if node in _meaning_labels else 'lightgreen'
    for node in G
]

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=3000,
    node_color=color_map,
    font_size=10,
    font_weight="bold",
    edge_color="gray",
)
plt.title('Network Graph of Meanings and Constructions')
plt.show()

# Drop rows whose meaning is coded 'ambiguous'.
filtered_data = data_filtered[data_filtered['meaning'] != 'ambiguous']

# Rebuild the meaning/construction graph from the remaining rows.
G_filtered = nx.Graph()
for meaning, construction in zip(filtered_data['meaning'], filtered_data['construction_clean']):
    G_filtered.add_node(meaning, label=meaning)
    G_filtered.add_node(construction, label=construction)
    G_filtered.add_edge(meaning, construction)

# Degree centrality drives the node sizes.
degree_centrality_filtered = nx.degree_centrality(G_filtered)

# Meanings in blue, constructions in green; size = centrality * 3000.
_filtered_meaning_labels = filtered_data['meaning'].values
color_map_filtered = [
    'skyblue' if node in _filtered_meaning_labels else 'lightgreen'
    for node in G_filtered
]
sizes = [degree_centrality_filtered[node] * 3000 for node in G_filtered]

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G_filtered)
nx.draw(
    G_filtered,
    pos,
    with_labels=True,
    node_size=sizes,
    node_color=color_map_filtered,
    font_size=10,
    font_weight="bold",
    edge_color="gray",
)
plt.title('Network Graph of Meanings and Constructions (Filtered)')
plt.show()


In [None]:
# "Strength" of a meaning/construction pair = number of rows in which both occur.
pair_strengths = (
    filtered_data_clean
    .groupby(['meaning', 'construction_clean'])
    .size()
    .reset_index(name='strength')
)

# Show the raw strengths.
tools.display_dataframe_to_user(
    name="Strengths Between Meanings and Constructions",
    dataframe=pair_strengths,
)

pair_strengths.head()


# Normalise each strength by the total over all pairs.
total_occurrences = pair_strengths['strength'].sum()
pair_strengths['normalized_strength'] = pair_strengths['strength'].div(total_occurrences)

# Show the table again, now with the normalised column.
tools.display_dataframe_to_user(
    name="Normalized Strengths Between Meanings and Constructions",
    dataframe=pair_strengths,
)

pair_strengths.head()

from scipy.stats import chi2_contingency
import numpy as np

# Meaning x construction cross-tabulation.
contingency_table = pd.crosstab(
    filtered_data_clean['meaning'],
    filtered_data_clean['construction_clean'],
)

# Chi-square test of independence on the full table.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))).
n = np.sum(contingency_table.values)
_smaller_dim = min(contingency_table.shape)
cramers_v = np.sqrt(chi2 / (n * (_smaller_dim - 1)))

# Collect the statistics into a two-column table.
stats_results = {
    "Chi-Square Statistic": chi2,
    "p-value": p,
    "Degrees of Freedom": dof,
    "Cramér's V": cramers_v,
}
stats_results_df = pd.DataFrame(
    list(stats_results.items()),
    columns=['Statistic', 'Value'],
)

tools.display_dataframe_to_user(name="Chi-Square Test Results", dataframe=stats_results_df)

stats_results_df

# Observed counts; `expected` comes from the chi2_contingency call above and is
# kept as floats, because truncating it to integers would distort every residual.
observed = contingency_table.values

# Pearson residuals: (observed - expected) / sqrt(expected).
# These are on a standardized scale, which is what the |residual| > 2 rule
# of thumb below assumes; raw count differences are not.
residuals = (observed - expected) / np.sqrt(expected)

# Label the residuals with the contingency table's meanings and constructions.
residuals_df = pd.DataFrame(residuals, index=contingency_table.index, columns=contingency_table.columns)

# Cells with |residual| > 2 are treated as significant.
# Indexing with the boolean frame blanks the other cells to NaN; stack() then drops them.
significant_pairs = residuals_df[residuals_df.abs() > 2].stack().reset_index()
significant_pairs.columns = ['Meaning', 'Construction', 'Residual']

tools.display_dataframe_to_user(name="Significant Construction-Meaning Pairs", dataframe=significant_pairs)

significant_pairs.head()

# Heatmap mask: True hides a cell, so the non-significant cells are masked out.
mask = residuals_df.abs() <= 2

# Heatmap showing only the significant cells, colour-centred on zero.
plt.figure(figsize=(12, 8))
sns.heatmap(residuals_df, annot=True, fmt=".2f", mask=mask, cmap="coolwarm", center=0, cbar_kws={'label': 'Residuals'})
plt.title('Heatmap of Residuals Highlighting Significant Construction-Meaning Pairs')
plt.xlabel('Construction')
plt.ylabel('Meaning')
plt.show()



In [None]:
# Frequencies of the meanings that belong to each of the two meaning groups.
def _keep_if_member(allowed):
    """Return a mapper that keeps a value found in `allowed` and maps others to None."""
    return lambda value: value if value in allowed else None


aspectotemporal_frequencies = data['meaning'].apply(_keep_if_member(aspectotemporal_meanings)).value_counts()
intersubjective_frequencies = data['meaning'].apply(_keep_if_member(intersubjective_connotations)).value_counts()

# Three-level graph: superordinate concept -> two core groups -> individual meanings.
G = nx.Graph()

# Superordinate node (it has no 'size' attribute, so the default size applies when drawing).
G.add_node("epistemic contingency", group='superordinate')

# Individual meanings, sized by their frequency.
for _group_name, _frequencies in (
    ('aspectotemporal', aspectotemporal_frequencies),
    ('intersubjective', intersubjective_frequencies),
):
    for meaning, freq in _frequencies.items():
        G.add_node(meaning, group=_group_name, size=freq)

# Core nodes, sized by the summed frequency of their members.
G.add_node("aspectotemporal meanings", group='core', size=aspectotemporal_frequencies.sum())
G.add_node("intersubjective connotations", group='core', size=intersubjective_frequencies.sum())

# Link every meaning to its core node.
for meaning in aspectotemporal_frequencies.index:
    G.add_edge(meaning, "aspectotemporal meanings")
for meaning in intersubjective_frequencies.index:
    G.add_edge(meaning, "intersubjective connotations")

# Link both core nodes to the superordinate concept.
for _core_node in ("aspectotemporal meanings", "intersubjective connotations"):
    G.add_edge(_core_node, "epistemic contingency")

# Layout.
pos = nx.spring_layout(G, k=0.5)

plt.figure(figsize=(14, 10))

# Per-node group labels and sizes (default size 100, everything scaled by 50).
colors = [attributes['group'] for _, attributes in G.nodes(data=True)]
sizes = [attributes.get('size', 100) * 50 for _, attributes in G.nodes(data=True)]

# Group -> colour.
color_map = {
    'aspectotemporal': 'skyblue',
    'intersubjective': 'lightcoral',
    'core': 'yellow',
    'superordinate': 'green',
}
node_colors = [color_map[group] for group in colors]

nx.draw(
    G,
    pos,
    with_labels=True,
    node_color=node_colors,
    edge_color='gray',
    node_size=sizes,
    font_size=10,
    font_weight='bold',
)

plt.title('Network Graph of Meaning Types with Epistemic Contingency')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Load the dataset (placeholder path; point it at the real CSV before running).
file_path = 'Path_to_Your_File.csv'
data = pd.read_csv(file_path)

# The two columns that define the graph.
construction_types = data['construction_type']
construction_clean = data['construction_clean']

# Frequency of each (construction_clean, construction_type) pair; becomes the edge weight.
edges = data.groupby(['construction_clean', 'construction_type']).size().reset_index(name='weight')

# Directed graph: schema -> construction type.
G = nx.DiGraph()

# Count each column once instead of recomputing value_counts() inside the loops.
clean_counts = construction_clean.value_counts()
type_counts = construction_types.value_counts()

# Labels that occur in BOTH columns. Their construction-type node gets a
# '_type' suffix so it does not collapse onto the schema node of the same name.
duplicate_nodes = set(clean_counts.index) & set(type_counts.index)

# node_sizes maps each graph node name to its frequency.
node_sizes = {}

# Schema nodes (blue), in order of first appearance in the data.
for node in construction_clean.dropna().unique():
    node_sizes[node] = clean_counts.loc[node]
    G.add_node(node, size=node_sizes[node], color='blue')

# Construction-type nodes (red). Previously the '_type' branch sat behind a
# check that every duplicate already satisfied, so it could never run and the
# suffixed nodes were created by add_edge without size or colour.
for node in construction_types.dropna().unique():
    graph_name = node + '_type' if node in duplicate_nodes else node
    node_sizes[graph_name] = type_counts.loc[node]
    G.add_node(graph_name, size=node_sizes[graph_name], color='red')

# Weighted edges, redirecting duplicate labels to their '_type' node.
for _, row in edges.iterrows():
    target = row['construction_type']
    if target in duplicate_nodes:
        target = target + '_type'
    G.add_edge(row['construction_clean'], target, weight=row['weight'])

# Figure without axes decorations.
plt.figure(figsize=(15, 10))
plt.axis('off')

# Spring layout centred on (0.5, 0.5).
pos_central = nx.spring_layout(G, k=0.5, center=[0.5, 0.5])

# Nodes: size/colour come from node attributes, with fallbacks for nodes lacking them.
sizes = []
colors = []
for _, attributes in G.nodes(data=True):
    sizes.append(attributes.get('size', 10)*10)
    colors.append(attributes.get('color', 'black'))
nx.draw_networkx_nodes(G, pos_central, node_size=sizes, node_color=colors)

# Edges: width is the weight divided by 50 (weight defaults to 1 when absent).
weights = [attributes.get('weight', 1) for _, _, attributes in G.edges(data=True)]
scaled_weights = [weight/50 for weight in weights]
nx.draw_networkx_edges(G, pos_central, width=scaled_weights)

# Labels: strip the '_type' marker from node names and right-align the text.
labels = {}
for node in G.nodes():
    labels[node] = node.replace('_type', '')
nx.draw_networkx_labels(
    G,
    pos_central,
    labels=labels,
    font_size=8,
    verticalalignment='center',
    horizontalalignment='right',
)

plt.title('Network graph of constructional schemas and construction types')

# Write the figure to a PDF and close it without displaying.
pdf_file_path = 'Network_Graph_Constructional_Schemas_and_Types.pdf'
plt.savefig(pdf_file_path, format='pdf', bbox_inches='tight')
plt.close()

print(f"Graph saved as {pdf_file_path}")


In [None]:
import pandas as pd

# Read the merged constructions CSV.
file_path = '/mnt/data/ProgConstructionsMerged.csv'
data = pd.read_csv(file_path)

# Peek at the first rows.
data.head()


def _symbolic_relation(row):
    """Format one row as '<type> (<schema>): <meaning>'."""
    return f"{row['construction_type']} ({row['construction_clean']}): {row['meaning']}"


# Form/meaning pairing as a single string per row.
data['symbolic_relation'] = data.apply(_symbolic_relation, axis=1)

# Compare the new column with the columns it was built from.
_relation_columns = ['construction_type', 'construction_clean', 'meaning', 'symbolic_relation']
data[_relation_columns].head()

import seaborn as sns
import matplotlib.pyplot as plt

# Absolute frequency of every symbolic relation string.
frequency_table = data['symbolic_relation'].value_counts().reset_index()
frequency_table.columns = ['Symbolic Relation', 'Frequency']

# Relative frequency as a percentage of all rows.
total_entries = len(data)
frequency_table['Relative Frequency'] = frequency_table['Frequency'] / total_entries * 100

frequency_table

# Schema x meaning count matrix.
heatmap_data = data.pivot_table(
    index='construction_clean',
    columns='meaning',
    values='symbolic_relation',
    aggfunc='count',
    fill_value=0,
)

# The same matrix as a percentage of the grand total.
_grand_total = heatmap_data.sum().sum()
heatmap_data_relative = heatmap_data.div(_grand_total).multiply(100)

# Draw both heatmaps with identical styling; only data, number format and title differ.
for _matrix, _number_format, _title in (
    (heatmap_data, "d", 'Heatmap of Absolute Frequencies of Symbolic Relations'),
    (heatmap_data_relative, ".2f", 'Heatmap of Relative Frequencies of Symbolic Relations'),
):
    plt.figure(figsize=(12, 8))
    sns.heatmap(_matrix, annot=True, fmt=_number_format, cmap="YlGnBu")
    plt.title(_title)
    plt.ylabel('Construction Form (Clean)')
    plt.xlabel('Meaning')
    plt.show()

from scipy.stats import chi2_contingency
import numpy as np

# Exclude rows where either the schema or the meaning is coded 'ambiguous'.
_schema_ok = data['construction_clean'] != 'ambiguous'
_meaning_ok = data['meaning'] != 'ambiguous'
analysis_data = data[_schema_ok & _meaning_ok]

# Schema x meaning contingency table.
contingency_table = pd.crosstab(analysis_data['construction_clean'], analysis_data['meaning'])

# Chi-square test of independence.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

chi2, p_value, dof

# Each cell as a percentage of its column (meaning) total.
_meaning_totals = contingency_table.sum(axis=0)
contribution_percentages = contingency_table.div(_meaning_totals, axis=1) * 100

# Meanings as rows, for readability.
contribution_percentages.transpose()

# Heatmap of the contribution percentages.
plt.figure(figsize=(14, 10))
sns.heatmap(
    contribution_percentages.transpose(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=.5,
)
plt.title('Contribution of Each Construction to Certain Meanings')
plt.xlabel('Construction Schema')
plt.ylabel('Meaning')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

# Save to PDF before showing.
visual_contribution_pdf = "/mnt/data/Visual_Contribution_Heatmap.pdf"
plt.savefig(visual_contribution_pdf)
plt.show()

visual_contribution_pdf

# Residuals scaled by sqrt(expected): (observed - expected) / sqrt(expected).
_deviation = contingency_table - expected
residuals = _deviation / np.sqrt(expected)

# Heatmap of the residuals, with the colour scale centred on zero.
plt.figure(figsize=(14, 10))
sns.heatmap(
    residuals.transpose(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    linewidths=.5,
)
plt.title('Standardized Residuals for Chi-Squared Contributions')
plt.xlabel('Construction Schema')
plt.ylabel('Meaning')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

# Save to PDF before showing.
residuals_heatmap_pdf = "/mnt/data/Residuals_Chi_Squared_Contribution.pdf"
plt.savefig(residuals_heatmap_pdf)
plt.show()

residuals_heatmap_pdf

from scipy.stats import chi2_contingency

# One chi-square test per (construction schema, meaning) pair.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
_chi_square_columns = ['Construction', 'Meaning', 'Chi-Square Statistic', 'p-value', 'Degrees of Freedom']
_chi_square_rows = []

for construction in contingency_table.index:
    for meaning in contingency_table.columns:
        # Table of "is this construction" vs "is this meaning".
        # NOTE(review): this is built from `data`, not the ambiguous-filtered
        # `analysis_data` behind contingency_table; confirm that is intended.
        observed = pd.crosstab(data['construction_clean'] == construction, data['meaning'] == meaning)

        # Perform the chi-square test
        chi2, p, dof, _ = chi2_contingency(observed)

        # Store the results
        _chi_square_rows.append({
            'Construction': construction,
            'Meaning': meaning,
            'Chi-Square Statistic': chi2,
            'p-value': p,
            'Degrees of Freedom': dof
        })

# Passing `columns` keeps the column order and yields an empty, correctly
# labelled frame when there are no pairs.
chi_square_results = pd.DataFrame(_chi_square_rows, columns=_chi_square_columns)

# Display the results in a table format
chi_square_results.head(len(chi_square_results))  # Showing all results could be large, adjust as needed

# Cramér's V helper used for the per-pair association strengths below.
def cramers_v(chi2, n, min_dim=2):
    """Return Cramér's V = sqrt(chi2 / (n * (min_dim - 1))).

    chi2: chi-square statistic of the tested table.
    n: total number of observations in that table.
    min_dim: smaller of the tested table's row/column counts. It defaults to 2
        because the callers in this notebook test 2x2 tables.

    Returns NaN when the denominator is not positive (no observations or a
    one-dimensional table), where V is undefined.
    """
    # The denominator is n * (k - 1); writing n * k - 1 gives a different, wrong value.
    denominator = n * (min_dim - 1)
    if denominator <= 0:
        return float('nan')
    return np.sqrt(chi2 / denominator)

# Schema x meaning matrix that receives one Cramér's V value per pair.
cramers_v_results = pd.DataFrame(
    index=contingency_table.index,
    columns=contingency_table.columns,
)

for construction in contingency_table.index:
    is_construction = data['construction_clean'] == construction
    for meaning in contingency_table.columns:
        # Table of "is this construction" vs "is this meaning".
        observed = pd.crosstab(is_construction, data['meaning'] == meaning)

        # Chi-square statistic of that table.
        chi2, p, dof, _ = chi2_contingency(observed)

        # Association strength for this pair; n is the table's grand total.
        cramers_v_value = cramers_v(chi2, observed.sum().sum())
        cramers_v_results.loc[construction, meaning] = cramers_v_value

# The frame was created without a dtype, so cast the collected values to float.
cramers_v_results = cramers_v_results.astype(float)

# Display the results in a table format
cramers_v_results

# Cramér's V helper (this cell repeats the earlier definition).
def cramers_v(chi2, n, min_dim=2):
    """Return Cramér's V = sqrt(chi2 / (n * (min_dim - 1))).

    chi2: chi-square statistic of the tested table.
    n: total number of observations in that table.
    min_dim: smaller of the tested table's row/column counts. It defaults to 2
        because the callers in this notebook test 2x2 tables.

    Returns NaN when the denominator is not positive (no observations or a
    one-dimensional table), where V is undefined.
    """
    # The denominator is n * (k - 1); writing n * k - 1 gives a different, wrong value.
    denominator = n * (min_dim - 1)
    if denominator <= 0:
        return float('nan')
    return np.sqrt(chi2 / denominator)

# Schema x meaning matrix that receives one Cramér's V value per pair.
cramers_v_results = pd.DataFrame(
    index=contingency_table.index,
    columns=contingency_table.columns,
)

for construction in contingency_table.index:
    is_construction = data['construction_clean'] == construction
    for meaning in contingency_table.columns:
        # Table of "is this construction" vs "is this meaning".
        observed = pd.crosstab(is_construction, data['meaning'] == meaning)

        # Chi-square statistic of that table.
        chi2, p, dof, _ = chi2_contingency(observed)

        # Association strength for this pair; n is the table's grand total.
        cramers_v_value = cramers_v(chi2, observed.sum().sum())
        cramers_v_results.loc[construction, meaning] = cramers_v_value

# The frame was created without a dtype, so cast the collected values to float.
cramers_v_results = cramers_v_results.astype(float)

# Display the results in a table format
cramers_v_results


In [None]:
from scipy.stats import chi2_contingency
import numpy as np

# Keep rows where neither the schema nor the meaning is coded 'ambiguous'.
_unambiguous = (data['construction_clean'] != 'ambiguous') & (data['meaning'] != 'ambiguous')
filtered_data = data[_unambiguous]

# Per-pair contingency tables (keyed by (construction, meaning)) and test results.
contingency_tables = {}
chi2_results = []

# Values remaining after filtering.
construction_types = filtered_data['construction_clean'].unique()
meanings = filtered_data['meaning'].unique()

for construction in construction_types:
    has_construction = filtered_data['construction_clean'] == construction
    for meaning in meanings:
        has_meaning = filtered_data['meaning'] == meaning

        # Boolean "is construction" x "is meaning" table for this pair.
        contingency = pd.crosstab(has_construction, has_meaning)
        contingency_tables[(construction, meaning)] = contingency

        # Chi-squared test on that table.
        chi2, p, dof, expected = chi2_contingency(contingency)

        result_row = {
            'Construction': construction,
            'Meaning': meaning,
            'Chi2 Statistic': chi2,
            'p-value': p,
            'Significance': p < 0.05,  # alpha = 0.05
            'Observed Frequency': contingency.iloc[1, 1],  # True/True cell: pair co-occurrence
            'Expected Frequency': expected[1, 1],  # expected count of that cell under independence
        }
        chi2_results.append(result_row)

# Results table, smallest p-values first.
chi2_results_df = pd.DataFrame(chi2_results)
chi2_results_df.sort_values('p-value', inplace=True)

chi2_results_df.head(10), chi2_results_df.tail(10)  # top and bottom 10 for brevity


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Columns of X the tree is fitted on; also used as the feature labels in the plot
tree_features = ['construction_encoded', 'adv_function_encoded']

# Build a depth-limited, seeded decision tree and fit it against y
dtree = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X[tree_features], y)

# Draw the fitted tree; class labels are taken from le_meaning.classes_
plt.figure(figsize=(20, 10))
tree_plot = plot_tree(
    dtree,
    feature_names=tree_features,
    class_names=le_meaning.classes_,
    filled=True,
    rounded=True,
    fontsize=12,
)

plt.show()


In [None]:
from statsmodels.stats.multitest import multipletests
from itertools import combinations


def _pairwise_chi2(frame, group_col, groups, pair_key, constructions):
    """Run a chi-square test for each construction on every pair of groups.

    Args:
        frame: DataFrame holding ``group_col`` and 'construction_clean'.
        group_col: Name of the grouping column.
        groups: Distinct values of ``group_col`` to compare pairwise.
        pair_key: Result key under which the "<a> vs <b>" label is stored.
        constructions: Iterable of construction labels, tested one at a time.

    Returns:
        A list of dicts with keys 'Construction', ``pair_key``, 'Chi-square'
        and 'p-value', ordered by construction and then by pair.
    """
    # The row subset depends only on the pair, so build it once per pair
    # rather than twice per (construction, pair) combination.
    pair_subsets = [
        (first, second, frame[frame[group_col].isin([first, second])])
        for first, second in combinations(groups, 2)
    ]

    rows = []
    for construction in constructions:
        for first, second, subset in pair_subsets:
            # Rows: the two groups; columns: whether the row uses this construction
            table = pd.crosstab(subset[group_col], subset['construction_clean'] == construction)
            chi2, p, dof, expected = chi2_contingency(table)
            rows.append({
                'Construction': construction,
                pair_key: f'{first} vs {second}',
                'Chi-square': chi2,
                'p-value': p
            })
    return rows


def _show_table(title, frame):
    """Print *title* followed by the complete text rendering of *frame*."""
    print(title)
    print(frame.to_string(index=False))


# NOTE(review): `construction_types` comes from an earlier cell where it was derived
# from rows without 'ambiguous' labels, while `data` is used unfiltered here, so
# 'ambiguous' rows count as "not this construction" -- confirm this is intended.

# List of unique school languages (missing values are not treated as a group)
school_languages = data['author school language'].dropna().unique()

# Pairwise chi-square tests for each constructional schema
pairwise_results = _pairwise_chi2(
    data, 'author school language', school_languages, 'Language Pair', construction_types
)

# Convert results to a DataFrame
pairwise_results_df = pd.DataFrame(pairwise_results)

# Apply multiple testing correction (Bonferroni correction)
corrected_pvals = multipletests(pairwise_results_df['p-value'], method='bonferroni')
pairwise_results_df['corrected p-value'] = corrected_pvals[1]

# The original call went through an undefined `tools` helper; print the table instead
_show_table("Pairwise Chi-square Test Results for Each Constructional Schema", pairwise_results_df)

# A bare expression in the middle of a cell is not displayed, so print it explicitly
print(pairwise_results_df.sort_values(by='corrected p-value').head())

# List of unique clusters (missing values are not treated as a group)
clusters = data['clusters_olga'].dropna().unique()

# Pairwise chi-square tests for each construction schema
pairwise_results_clusters = _pairwise_chi2(
    data, 'clusters_olga', clusters, 'Cluster Pair', construction_types
)

# Convert results to a DataFrame
pairwise_results_clusters_df = pd.DataFrame(pairwise_results_clusters)

# Apply multiple testing correction (Bonferroni correction)
corrected_pvals_clusters = multipletests(pairwise_results_clusters_df['p-value'], method='bonferroni')
pairwise_results_clusters_df['corrected p-value'] = corrected_pvals_clusters[1]

_show_table("Pairwise Chi-square Test Results for Clusters Olga", pairwise_results_clusters_df)

pairwise_results_clusters_df.sort_values(by='corrected p-value').head()



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Load your data
data = pd.read_csv('/mnt/data/ProgConstructionsMerged.csv')

# Filter out the "ambiguous" category.
# .copy() makes this an independent frame, so adding the 'Cluster' column below
# does not write into a slice of `data` (which triggers SettingWithCopyWarning).
filtered_data_no_ambiguous = data[data['construction_clean'] != 'ambiguous'].copy()

# Count rows per (author_id, construction_clean): one row per author, one column
# per construction, 0 where an author never uses a construction
author_contributions = filtered_data_no_ambiguous.groupby(['author_id', 'construction_clean']).size().unstack().fillna(0)

# Standardize the per-author counts for PCA and clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(author_contributions)

# Project the standardized counts onto the first two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# K-means (k=3, seeded) on the standardized counts, not on the PCA projection.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it if cluster labels must be reproducible everywhere.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Add cluster information back to author_contributions DataFrame
author_contributions['Cluster'] = clusters

# Map each row's author_id to that author's cluster label
filtered_data_no_ambiguous['Cluster'] = filtered_data_no_ambiguous['author_id'].map(author_contributions['Cluster'])

# Constructional schema columns to summarise; each must exist as a column of
# author_contributions, otherwise the selection below raises KeyError
schema_columns = ['V+ADV', 'V1+and+V1', 'Vaux+V', 'VingGER', 'aspectual+V', 'beim+Vinf+sein', 'find+V', 'perception+V']

# Attach each author's per-schema counts to every row written by that author
filtered_data_with_schemas = filtered_data_no_ambiguous.merge(author_contributions[schema_columns], left_on='author_id', right_index=True, how='left')

# Mean of the attached per-author counts within each cluster.
# NOTE(review): the mean runs over data rows, so an author contributes once per
# row rather than once per author -- confirm this weighting is intended.
cluster_means_corrected = filtered_data_with_schemas.groupby('Cluster')[schema_columns].mean()

# Visualize the PCA results with clustering information
plt.figure(figsize=(14, 8))
sns.scatterplot(x=pca_result[:, 0], y=pca_result[:, 1], hue=clusters, palette='viridis', legend='full')
plt.title('PCA of Authors Based on Constructional Schema Usage with Cluster Information')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()

# Visualize the cluster means for constructional schemas (schemas as rows, clusters as columns)
plt.figure(figsize=(14, 8))
sns.heatmap(cluster_means_corrected.T, annot=True, cmap='YlGnBu', cbar=True)
plt.title('Cluster Means for Constructional Schemas')
plt.xlabel('Cluster')
plt.ylabel('Construction Schema')
plt.show()


In [None]:
# Creating a network graph with branching principles
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Reload the data
file_path = '/mnt/data/ProgConstructionsMerged.csv'
data = pd.read_csv(file_path)

# Extract relevant columns for analysis
meaning_data = data['meaning']

# Calculate frequencies of each meaning
meaning_frequencies = meaning_data.value_counts()

# Define main meaning and its subtypes
main_meaning = 'ongoingness'
aspecto_temporal = ['orientation', 'virtual ongoingness', 'duration', 'temporary validity', 'habitual']
intersubjective_connotations = ['emphasis', 'modal']
# `subtypes` was used below without ever being defined; it is the union of both groups
subtypes = aspecto_temporal + intersubjective_connotations

# Node size = 10 x frequency of the meaning in the data
node_sizes = (meaning_frequencies * 10).to_dict()
# A meaning that never occurs in the data gets size 0 instead of raising KeyError
for name in subtypes + [main_meaning]:
    node_sizes.setdefault(name, 0)

# Create a new graph for the updated layout
G_final = nx.DiGraph()

# Add nodes for all subtypes and the main meaning
for subtype in subtypes:
    G_final.add_node(subtype, size=node_sizes[subtype])
G_final.add_node(main_meaning, size=node_sizes[main_meaning])

# Add edges for the main meaning and subtypes with the corrected branching principles
# Singular and actual: orientation, duration, temporary validity
singular_actual = ['orientation', 'duration', 'temporary validity']
for subtype in singular_actual:
    G_final.add_edge(main_meaning, subtype, branching='singular and actual')
# Connect every pair of nodes within this principle
for first, second in combinations(singular_actual, 2):
    G_final.add_edge(first, second, branching='singular and actual')

# Virtual and actual: virtual ongoingness
G_final.add_edge(main_meaning, 'virtual ongoingness', branching='virtual and actual')

# Multiple and virtual: habitual
G_final.add_edge(main_meaning, 'habitual', branching='multiple and virtual')

# Non-temporal: modal, emphasis
non_temporal = ['modal', 'emphasis']
for subtype in non_temporal:
    G_final.add_edge(main_meaning, subtype, branching='non-temporal')
# Connect nodes within this principle
G_final.add_edge('modal', 'emphasis', branching='non-temporal')

# Add dotted line connection from virtual ongoingness to habitual with label 'virtual'
G_final.add_edge('virtual ongoingness', 'habitual', branching='virtual', style='dotted')

# Define colors for the groups
node_colors = {
    **{node: 'lightgreen' for node in aspecto_temporal},
    **{node: 'lightcoral' for node in intersubjective_connotations},
    main_meaning: 'skyblue'
}

# Draw the final graph with labeled edges and dotted line
plt.figure(figsize=(16, 16))
pos_final = nx.circular_layout(G_final)

# Per-node colors and sizes, in the graph's node order
colors_final = [node_colors[node] for node in G_final.nodes]
sizes_final = [node_sizes[node] for node in G_final.nodes]

# Draw nodes
nx.draw_networkx_nodes(G_final, pos_final, node_size=sizes_final, node_color=colors_final)

# Draw solid edges
solid_edges = [(u, v) for u, v, d in G_final.edges(data=True) if d.get('style') != 'dotted']
nx.draw_networkx_edges(G_final, pos_final, edgelist=solid_edges, edge_color='gray')

# Draw dotted edges
dotted_edges = [(u, v) for u, v, d in G_final.edges(data=True) if d.get('style') == 'dotted']
nx.draw_networkx_edges(G_final, pos_final, edgelist=dotted_edges, edge_color='gray', style='dotted')

# Draw labels
nx.draw_networkx_labels(G_final, pos_final, font_size=10, font_family="sans-serif", verticalalignment='bottom')

# Draw edge labels for branching principles
edge_labels = nx.get_edge_attributes(G_final, 'branching')
nx.draw_networkx_edge_labels(G_final, pos_final, edge_labels=edge_labels, font_size=8)

# Legend entries were never defined in the original cell; build proxy artists that
# mirror the node colors and the two edge styles used above
legend_elements = [
    Patch(facecolor='skyblue', label='ongoingness (main meaning)'),
    Patch(facecolor='lightgreen', label='aspecto-temporal subtypes'),
    Patch(facecolor='lightcoral', label='intersubjective connotations'),
    Line2D([0], [0], color='gray', linestyle='solid', label='branching principle'),
    Line2D([0], [0], color='gray', linestyle='dotted', label='virtual link'),
]

# Add the legend
plt.legend(handles=legend_elements, loc='upper right')
plt.title('Network Graph of Meaning with Corrected Branching Principles')

# Save the plot as a PNG file, then show it like the other plotting cells do
file_path_png = '/mnt/data/network_graph_meaning.png'
plt.savefig(file_path_png)
plt.show()
