# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from mentor_finder.data import load_raw_committee_csv

# Let wide/long frames render in full (up to 100 rows and 100 columns)
# instead of being truncated by the pandas defaults.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

## Load data

In [None]:
# Read the raw committee CSV through the project's loader.
df = load_raw_committee_csv("data/committee.csv")
# Bare last expression so the notebook renders the first rows as the cell output.
df.head()

## Data quality and completeness

### Missing values

In [None]:
# Print the per-column summary (dtypes and non-null counts) to check for missing values.
df.info()

**Conclusion**: The dataset is complete with no missing values.

## Thesis
### Thesis status

In [None]:
# Bar chart of how many theses fall into each status.
status_counts = df["thesis_status"].value_counts()
status_counts.plot(kind="bar")
plt.show()

### Thesis title

In [None]:
# Character length of each thesis title, per language.
title_len_mk = df["thesis_title_mk"].str.len()
title_len_en = df["thesis_title_en"].str.len()

# Both histograms are drawn without an explicit `ax`; the English one is
# semi-transparent so the Macedonian bars stay visible where they share axes.
title_len_mk.plot.hist(bins=30, color="orange")
title_len_en.plot.hist(
    bins=30,
    color="green",
    alpha=0.7,
    xlabel="Thesis Title Length",
)

# Legend labels follow the order in which the histograms were drawn.
plt.legend(["Title in Macedonian", "Title in English"])
plt.show()

### Thesis abstract

In [None]:
# Character length of each thesis description, per language.
desc_len_mk = df["thesis_desc_mk"].str.len()
desc_len_en = df["thesis_desc_en"].str.len()

# Both histograms are drawn without an explicit `ax`; the English one is
# semi-transparent so the Macedonian bars stay visible where they share axes.
desc_len_mk.plot.hist(bins=50, color="orange")
desc_len_en.plot.hist(
    bins=50,
    color="green",
    alpha=0.7,
    xlabel="Thesis Description Length",
)

# Legend labels follow the order in which the histograms were drawn.
plt.legend(["Description in Macedonian", "Description in English"])
plt.show()

## Mentor

### Workload distribution

In [None]:
# Number of theses per mentor, as a one-column frame.
theses_per_mentor = df["mentor"].value_counts()
mentor_counts = theses_per_mentor.to_frame()

# Show the first ten rows, the last ten rows, and the summary statistics.
for mentor_view in (
    mentor_counts.head(10),
    mentor_counts.tail(10),
    mentor_counts.describe(),
):
    display(mentor_view)

## Commission members

In [None]:
# Stack both commission-member columns into one series so that each
# appearance in either `c1` or `c2` is counted for that person.
all_commission_seats = pd.concat([df["c1"], df["c2"]])
commision_count = all_commission_seats.value_counts().to_frame()

# Bare last expression so the notebook renders the preview as the cell output.
commision_count.head()

### Workload distribution

In [None]:
# Show the first ten rows, the last ten rows, and the summary statistics
# of the per-member commission counts.
for commission_view in (
    commision_count.head(10),
    commision_count.tail(10),
    commision_count.describe(),
):
    display(commission_view)

## Mentor - Commission member pairs

In [None]:
# Long format: one row per (mentor, commission member) appearance, taken
# from both the `c1` and `c2` columns.
mentor_commission_pairs = df.melt(
    id_vars=["mentor"],
    value_vars=["c1", "c2"],
    value_name="commission_member",
)

# Count how often each mentor appears together with each commission member.
pair_counts = mentor_commission_pairs.groupby(["mentor", "commission_member"]).agg(
    count=("commission_member", "count")
)


def _sort_by_count_desc(group):
    """Return one mentor's rows ordered by descending pair count."""
    return group.sort_values(by="count", ascending=False)


# Within each mentor (index level 0), order the members by descending count.
pair_counts = pair_counts.groupby(level=0, group_keys=False).apply(_sort_by_count_desc)

# Total commission appearances per mentor, largest first.
mentor_totals = pair_counts.groupby(level=0).agg(sum=("count", "sum"))
mentor_totals = mentor_totals.sort_values(by="sum", ascending=False)

# Reindex the mentor level against the totals index (sorted by descending
# total), then flatten the MultiIndex back into ordinary columns.
mentor_commission_counts = pair_counts.reindex(mentor_totals.index, level=0)
mentor_commission_counts = mentor_commission_counts.reset_index()

for pair_view in (mentor_commission_counts.head(10), mentor_commission_counts.tail(10)):
    display(pair_view)

### Mentor commission diversity

In [None]:
# For each mentor, count the rows of the pair table, i.e. how many
# commission-member entries are listed for that mentor.
pairs_by_mentor = mentor_commission_counts.groupby("mentor")
commission_diversity = pairs_by_mentor.agg(
    commission_diversity=("commission_member", "count")
)
commission_diversity = commission_diversity.sort_values(
    by="commission_diversity", ascending=False
)

# Show the first ten rows, the last ten rows, and the summary statistics.
for diversity_view in (
    commission_diversity.head(10),
    commission_diversity.tail(10),
    commission_diversity.describe(),
):
    display(diversity_view)