# Importing the libraries

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Folder holding the input files read by this notebook.
DATASET_PATH = './data'
# Folder where the generated figures are written.
GRAPHS_PATH = './graphs'

# savefig does not create missing folders, so make sure the output folder
# exists before any plotting cell tries to write into it.
os.makedirs(GRAPHS_PATH, exist_ok=True)

# Reading the file

In [2]:
# Load the survey answers; the file uses ';' as its field separator.
df = pd.read_csv(f'{DATASET_PATH}/divorce_data.csv', sep=';')

# Split the rows by the value of the 'Divorce' label (1 = divorced, 0 = not).
divorced_rows = df.loc[df['Divorce'] == 1]
not_divorced_rows = df.loc[df['Divorce'] == 0]

# Mean of every column within each group, with the label itself removed.
questions_mean_with_divorce_1 = divorced_rows.mean().drop('Divorce')
questions_mean_with_divorce_0 = not_divorced_rows.mean().drop('Divorce')

FileNotFoundError: [Errno 2] No such file or directory: './data/divorce_data.csv'

# Preparing the data to train the model

In [None]:
# No feature selection: every column except the label is used as a feature.
X = df.drop(columns='Divorce')
y = df['Divorce']

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Training the model

In [None]:
# Fit an ordinary least-squares model on the training split.
# fit() returns the estimator itself, so creation and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

# Testing the model

In [None]:
# Predict on the held-out rows and score the predictions against the labels.
y_predict = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_predict)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_predict)

# R2 is reported as a percentage, the mean absolute error as a plain number.
report = f"R2-Score: {r2*100:.2f}%\nMean Absolute Error: {MAE:.2f}"
print(report)

# Getting the correlation matrix to do some further analysis

In [None]:
# Pairwise correlations between all columns of the dataset.
corr_matrix = df.corr()

# Correlation of every question with 'Divorce' (the label's self-correlation
# is dropped), sorted in descending order: highest coefficient first.
corr_with_divorce = corr_matrix.loc[:, 'Divorce'].drop(labels='Divorce')
corr_with_divorce_sorted = corr_with_divorce.sort_values(ascending=False)

# The two ends of the ranking: the 10 highest and the 10 lowest coefficients.
# NOTE(review): "lowest" means lowest signed value, not smallest magnitude;
# if any correlation is negative these are not the least influential questions.
first_10_corr = corr_with_divorce_sorted.head(10)
last_10_corr = corr_with_divorce_sorted.iloc[-10:]

# Mean answer to those questions among couples who divorced.
first_10_with_divorce = questions_mean_with_divorce_1.loc[first_10_corr.index]
last_10_with_divorce = questions_mean_with_divorce_1.loc[last_10_corr.index]

# Mean answer to those questions among couples who did not divorce.
first_10_no_divorce = questions_mean_with_divorce_0.loc[first_10_corr.index]
last_10_no_divorce = questions_mean_with_divorce_0.loc[last_10_corr.index]

# Graph for the correlation of all questions

In [None]:
# Horizontal bar chart with one bar per question, in the order of
# corr_with_divorce_sorted.
fig, ax = plt.subplots(figsize=(12, 15))
sns.barplot(
    x=corr_with_divorce_sorted.values,
    y=corr_with_divorce_sorted.index,
    hue=corr_with_divorce_sorted.index,  # gives each bar its own palette colour
    palette='coolwarm',
    legend=False,
    ax=ax,
)

ax.set_title("Influence (Correlation) of each Question on the 'Divorce' Variable", fontsize=16)
ax.set_xlabel('Pearson Correlation Coefficient', fontsize=12)
ax.set_ylabel('Question', fontsize=12)

# Print the coefficient, rounded to two decimals, at the end of each bar.
for row, coefficient in enumerate(corr_with_divorce_sorted.to_numpy()):
    ax.text(coefficient, row, f' {coefficient:.2f}', va='center')

fig.tight_layout()
fig.savefig(f'{GRAPHS_PATH}/all_questions_correlation_sorted_asc.png')
# Add plt.show() here if the figure is not rendered inline.

# Correlation of the most influential questions

In [None]:
# Horizontal bar chart of the 10 questions with the highest correlation.
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(
    x=first_10_corr.values,
    y=first_10_corr.index,
    hue=first_10_corr.index,  # gives each bar its own palette colour
    palette='coolwarm',
    legend=False,
    ax=ax,
)

ax.set_title("Influence (Correlation) of the 10 most influential questions on the variable 'Divorce'", fontsize=16)
ax.set_xlabel('Pearson Correlation Coefficient', fontsize=12)
ax.set_ylabel('Question', fontsize=12)

# Print the coefficient, rounded to two decimals, at the end of each bar.
for row, coefficient in enumerate(first_10_corr.to_numpy()):
    ax.text(coefficient, row, f' {coefficient:.2f}', va='center')

fig.tight_layout()
fig.savefig(f'{GRAPHS_PATH}/most_influent_questions.png')
# Add plt.show() here if the figure is not rendered inline.

# Correlation of the least influential questions

In [None]:
# Horizontal bar chart of the 10 questions with the lowest correlation.
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(
    x=last_10_corr.values,
    y=last_10_corr.index,
    hue=last_10_corr.index,  # gives each bar its own palette colour
    palette='coolwarm',
    legend=False,
    ax=ax,
)

ax.set_title("Influence (Correlation) of the 10 less influential questions on the variable 'Divorce'", fontsize=16)
ax.set_xlabel('Pearson Correlation Coefficient', fontsize=12)
ax.set_ylabel('Question', fontsize=12)

# Print the coefficient, rounded to two decimals, at the end of each bar.
for row, coefficient in enumerate(last_10_corr.to_numpy()):
    ax.text(coefficient, row, f' {coefficient:.2f}', va='center')

fig.tight_layout()
fig.savefig(f'{GRAPHS_PATH}/last_influent_questions.png')
# Add plt.show() here if the figure is not rendered inline.

# Reading questions text


In [None]:
# Question texts: second column of the '|'-separated reference file, as an array.
reference_table = pd.read_csv(f"{DATASET_PATH}/reference.tsv", sep = '|')
df_questions = reference_table.iloc[:, 1].to_numpy()

# Turn the column labels into question numbers ("Q40" -> 40).
MSQ_first = first_10_with_divorce.index.str.replace('Q','').astype(int)
MSQ_last = last_10_with_divorce.index.str.replace('Q','').astype(int)

# Question number n is stored at position n-1 of df_questions.
MSQ_first_strings = [df_questions[number - 1] for number in MSQ_first]
MSQ_last_strings = [df_questions[number - 1] for number in MSQ_last]

# Creating a file that summarizes the most influential, least influential questions, with correlation and average response between divorced and non-divorced couples


In [None]:
from IPython.display import display, Markdown

output_file = "Top_influent_questions.txt"

# One tuple per report line: question text, correlation, and the mean answer
# of divorced and non-divorced couples. The values are taken as NumPy scalars
# so that .round() is available on each of them.
most_rows = zip(
    MSQ_first_strings,
    first_10_corr.to_numpy(),
    first_10_with_divorce.to_numpy(),
    first_10_no_divorce.to_numpy(),
)
least_rows = zip(
    MSQ_last_strings,
    last_10_corr.to_numpy(),
    last_10_with_divorce.to_numpy(),
    last_10_no_divorce.to_numpy(),
)

with open(output_file, 'w', encoding='utf-8') as f:
    # Fixed introduction describing the survey and the answer scale.
    f.write("Gottman couples therapy\n\n")
    f.write("The couples are from various regions of Turkey wherein the records were acquired from face-to-face interviews from couples who were already divorced or happily married.\n")
    f.write("All responses were collected on a 5 point scale (0=Never, 1=Seldom, 2=Averagely, 3=Frequently, 4=Always).\n\n")

    # Numbered list of the questions with the highest correlation.
    f.write("Top most-influent questions in Gottman couples therapy\n\n")
    for position, (question, corr, mean_divorced, mean_not_divorced) in enumerate(most_rows, start=1):
        f.write(f"{position}. {question} ")
        f.write(f"with correlation: {corr:.2f} and mean for divorced couples: {mean_divorced.round():.0f} and mean for non-divorced couples: {mean_not_divorced.round():.0f}\n")

    # Numbered list of the questions with the lowest correlation.
    f.write("\n\nTop less-influent questions in Gottman couples therapy\n\n")
    for position, (question, corr, mean_divorced, mean_not_divorced) in enumerate(least_rows, start=1):
        f.write(f"{position}. {question} ")
        f.write(f"with correlation: {corr:.2f} and mean (for divorced couples): {mean_divorced.round():.0f} and mean for non-divorced couples: {mean_not_divorced.round():.0f}\n")

# Read the summary back from disk and render it as Markdown in the cell output.
# Any failure is reported as a printed message rather than a traceback.
try:
    with open(output_file, mode='r', encoding='utf-8') as summary_file:
        content = summary_file.read()
        display(Markdown(content))
except FileNotFoundError:
    print(f"Errore: File not found: {output_file}")
except Exception as e:
    print(f"An error occurred: {e}")
        

# *Question Search Engine*

In [None]:
# Number of the question to look up, counted from 1 as in the "Q<n>" labels.
NUMERO_DOMANDA = 6

# Reject numbers outside the list; 0 in particular would otherwise wrap around
# to the last question through negative indexing.
if not 1 <= NUMERO_DOMANDA <= len(df_questions):
    raise ValueError(
        f"NUMERO_DOMANDA must be between 1 and {len(df_questions)}, got {NUMERO_DOMANDA}"
    )

# Question n is stored at position n-1 of df_questions, the same mapping used
# when the texts of the top and bottom questions are collected.
display(Markdown(f"## **{df_questions[NUMERO_DOMANDA - 1]}** \n"))