# Final exam

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline # to plot in notebook

import seaborn as sns
import scipy.stats as stats

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegressionCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

### 0. Tips

 * The Collections library is your friend when using dictionaries
 * Collections library is your friend, for counting multiple objects
 * Reading files should be done with the following construct  

In [None]:
# NOTE(review): this folder is spelled 'Data/' while the paths further down use 'data/' —
# on a case-sensitive filesystem these are different directories; confirm which one exists.
DATA_FOLDER = 'Data/'
COUNTRIES_DATASET = DATA_FOLDER+"happiness2020.csv"

# Generic read pattern: the `with` block closes the handle even if reading raises.
# 'file' is a placeholder path.
with open('file', 'r') as file:
    print(file.read())

# Read a csv
# header=1 takes the column names from the second line of the file.
# NOTE(review): decimal=',' and sep=',' use the same character — confirm this is intended.
countries = pd.read_csv(COUNTRIES_DATASET, decimal=',', sep=',', header=1, skiprows=0)

# Read a tsv.gz
# read_table uses a tab separator by default; the second call spells out the same options explicitly.
edges = pd.read_table("data/links_task-B.tsv.gz")
paths = pd.read_csv('data/paths_df_task-B.tsv.gz', sep="\t", compression='gzip')

In [None]:
# Placeholder objects so the snippets below have names to refer to.
df=pd.DataFrame()
my_dict = {}
G=nx.Graph()
def some_function():
    """Placeholder helper: takes no arguments and always returns 0."""
    return 0

# Useful functions
df.ffill(axis=0, inplace=True)    # forward-fill NaNs down each column
df.dropna(inplace=True, axis=0)   # drop every row that still contains a NaN

# Signature reminder for fillna (all defaults spelled out).
# NOTE(review): executed literally, value=None together with method=None gives pandas nothing
# to fill with and is expected to raise — treat this line as documentation, confirm before running.
df.fillna(value=None, method=None, axis=None, inplace=False)
values = {"A": 0, "B": 1, "C": 2, "D": 3}
df.fillna(value=values) # Replace all NaN elements in columns 'A', 'B', 'C' and 'D' with 0, 1, 2 and 3 respectively.

# Rename positional columns, then split 'serie_episode' at the first "Episode" into two new columns.
df.rename(columns={0:'line_', 1:'scene', 2:'serie_episode'}, inplace=True)
df[['serie','episode']] = df.apply(lambda line: line['serie_episode'].split("Episode", 1), result_type='expand', axis=1)
# Column-wise concat (here of df with itself, as a placeholder), then drop the helper columns.
script = pd.concat([df, df], axis='columns').drop(["serie_episode", "line_"], axis=1)

df = pd.get_dummies(df) # pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [None]:
# DF.LOC
df.set_index('channel', inplace=True)                   # Set 'channel' as the index to use loc
channel_category = df.loc['PewDiePie', 'channel_cat']   # scalar lookup: row label, column label

# Invert a dictionary (values become keys). Duplicate values collapse to the last key seen.
index_to_channel = dict((v, k) for k, v in my_dict.items())
# Same keys as index_to_channel, but each value is replaced by that channel's category.
index_to_cat = dict((k, df.loc[v,'channel_cat']) for k, v in index_to_channel.items())

# Comprehensions: list with an `if` filter (no else), list of tuples, and dict
# NOTE(review): `channel_cats` and `category` are not defined in this notebook section.
array = [x for x in channel_cats if channel_cats[x] == category]
array = [(x['SRC'], x['TGT']) for i, x in df.iterrows()]
dict_edges_with_att = {(x['SRC'], x['TGT']): x['VOT_RND'] for i, x in df.iterrows()}
degree_centrality = nx.degree_centrality(G)
# Rebuild the dict with items ordered by ascending centrality value.
data = {k: v for k, v in sorted(degree_centrality.items(), key=lambda item: item[1])}

# Reading a key that may be missing: .get returns the default instead of raising KeyError
my_dict.get('key', 0) # or None
eigenvector_centrality = pd.Series(dict(nx.algorithms.eigenvector_centrality(G)))
# NOTE(review): in_degree exists only on directed graph classes; G is created as nx.Graph() above.
in_degree = pd.Series(dict(G.in_degree()))
# Stack the two Series as rows, transpose so nodes are rows, then name the two columns.
df_cent = pd.DataFrame([eigenvector_centrality, in_degree]).T.rename({0: "eigenvector_centrality",
                                                                      1: "in_degree"}, axis=1)

In [None]:
# Getting sorted flights for that day
# Each condition needs its own parentheses because & and | bind tighter than ==.
flights = df[(df['day'] == 1) & (df['airport'] == 2)] # 2 cond to filter df, with an AND
flights = df[(df['day'] == 1) | (df['airport'] == 2)] # 2 cond to filter df, with an OR (overwrites the AND result)
flights = flights.sort_values('hour')[['flight_id', 'dest_id']]

# smart ways to create lists
my_list = [10*x+1 for x in range(10)]   # 1, 11, 21, ..., 91

# agg function to get stats on some col
result = df.agg(['std', 'sum', 'mean'])
result = df.agg({'A': ['sum', 'min'], 'B': 'max'}) # diff stats on diff col

# groupby example: total seg_length per name, largest first, top 5
df.groupby(df.name).seg_length.sum().sort_values(ascending=False, inplace=False).head()
# join + derived column + bar plot in one chain
# NOTE(review): joining a frame with itself without lsuffix/rsuffix fails on overlapping column names;
# `df.join(df)` stands in for a join with a different frame.
df.join(df).assign(avg_views=lambda x: x['view_count']/x['channel'])[['avg_views']].\
                                plot(kind='bar')

# concat() (in the main pandas namespace) concatenates along an axis, with optional set logic (union or intersection) on the indexes of the other axes.
# DataFrame.join() is a convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.
# Prefer merge over join when combining on columns: join is meant for index-based alignment.

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments are converted to sets first, so duplicates and order are ignored.

    Args:
        list1: Any iterable of hashable items.
        list2: Any iterable of hashable items.

    Returns:
        A float in [0, 1]. Two empty inputs are treated as identical and give 1.0
        (the usual convention) instead of raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: the ratio is 0/0, defined as 1 by convention.
        return 1.0
    return len(s1 & s2) / len(union)

In [None]:
from collections import Counter # dict subclass mapping each element to its count (not hashable, like any dict)
element_counts = Counter(['apple', 'banana', 'apple', 'orange', 'banana', 'apple'])
character_counts = Counter("hello world")   # a string is counted character by character
# Dictionary: if a mapping is provided, its values are taken as the counts of its keys (nothing is re-counted). To count how often each value occurs, use Counter(d.values()).
# Set: each element gets a count of 1, since a set contains no duplicates.

In [None]:
numbers = [1, 2, 3]
letters = ['a', 'b', 'c']
# zip returns a lazy, single-use iterator that pairs items by position.
zipped = zip(numbers, letters)
# list(zipped) == [(1, 'a'), (2, 'b'), (3, 'c')]

### 1. Visualisation

Plot with two series on the same graph

In [None]:
# Two series on the same axes: each plt.plot call adds a line and the legend shows their labels.
# The label strings are placeholders; give each series a distinct one.
plt.plot(indegree, label="yoooooo")
plt.plot(outdegree, label="yoooooo")
plt.xlabel("yoooooo")
plt.legend()
plt.show();

Histogram

In [None]:
# A histogram bins the data and counts the number of values in each bin.
# If the data has already been binned and counted, use bar or stairs to plot the distribution instead.

# Option 1: the pandas wrapper
df['col'].hist(bins = 100)

# Option 2: matplotlib directly
plt.hist(df['col'].values, bins = 100)
plt.xticks(rotation=90)   # rotate tick labels so they do not overlap
plt.xlabel('yooooo')
plt.ylabel('yooooo')
plt.title('yooooo');

Bar (histo with two values, x and height)

In [None]:
# Bar chart of category frequencies (pandas wrapper).
topic_dist = df['topic'].value_counts()
topic_dist.plot(kind='bar')
plt.show()

# Bar chart with explicit x positions and heights (matplotlib directly).
plt.bar(x=df.index, height=df.title)
plt.xlabel('year')
plt.ylabel('videos')

# Horizontal bars with a 99% confidence interval as error bars.
# Half-width of the interval: z * s / sqrt(N), with z = 2.576 for 99%.
# np.sqrt is vectorised and already imported (the `math` module is not imported in this notebook).
sqrt_N = np.sqrt(df['total_posts'])
s = df['posts_length_stddev']
df['ci99'] = 2.576*(s / sqrt_N)

plt.barh(df.subreddit, df.posts_length, xerr=df.ci99)
plt.xlabel('Post length average (CI 99%)')
plt.ylabel('Subreddit')
plt.title('Average posts length')
plt.show()

Boxplot

In [None]:
# Single box plot of one column; the x tick is hidden because there is only one box.
plt.boxplot(df['col'])
plt.xticks([])
plt.title('TITLE');

Scatterplot (2 variables)

In [None]:
# Small, mostly transparent markers (s=2, alpha=0.2) keep dense regions readable.
plt.scatter(df['col1'], df['col2'], s = 2, alpha=0.2)
plt.xlabel('yoooooo')
plt.ylabel('yoooooo')

Heatmap

In [None]:
# Fill a 5x5 matrix cell by cell, then draw it with the value written in every cell (annot=True).
# NOTE(review): the placeholder `some_function` defined earlier in this notebook takes no arguments;
# substitute a real function of (G, i, j) here.
heatmap = np.zeros((5, 5))
for i in range(5):
    for j in range(5):
        f = some_function(G, i, j)
        heatmap[i][j]=f

sns.heatmap(heatmap, annot=True)
plt.ylabel("yoooooo")
plt.xlabel("yoooooo")
plt.title("yoooooo")
plt.show();

Pointplot (compare the value of y across the categories of x) (from the 2020 exam solution)

In [None]:
# Two point plots side by side: one y variable per panel, grouped by the categories of "finished".
# Passing ax= sends each plot to its own subplot.
fig, axs = plt.subplots(1,2, figsize=(15, 5))
sns.pointplot(x="finished", y="eigenvector_centrality_target", data=paths, ax=axs[0])
sns.pointplot(x="finished", y="in_degree_target", data=paths, ax=axs[1])

Errorbar 

In [None]:
# Mean and standard deviation of worldwide gross per year (one row per year, indexed by year).
stats_by_year = df.groupby('year')['worldwide_gross'].agg(
    average_worldwide_gross='mean',
    std_dev_worldwide_gross='std',
)

# Plot the per-year aggregates (not the raw frame, which has neither of these columns):
# yearly mean as the line, standard deviation as the error bars.
plt.errorbar(stats_by_year.index, stats_by_year.average_worldwide_gross,
             yerr = stats_by_year.std_dev_worldwide_gross,
             capsize= 3)
plt.xlabel('Year')
plt.ylabel('yoooo')

Log histogram and log-log histogram

In [None]:
# Histogram with a logarithmic y axis. plt.hist returns (counts, bin_edges, patches).
array_100 = plt.hist(df['col'], bins=100,log=True, histtype='step')
plt.ylabel("yoooooo")
plt.xlabel("yoooooo")
plt.title("yoooooo")
plt.show()

# Log-log view of the same counts: x is the right edge of each bin
# ([1:] drops the first edge so that edges and counts have the same length).
plt.loglog(array_100[1][1:],array_100[0])
plt.ylabel("yoooooo")
plt.xlabel("yoooooo")
plt.title("yoooooo")
plt.show()

# pandas wrapper: logarithmically spaced bins from 10^0 to 10^6, both axes on a log scale.
df.plot.hist(column=["Frequency"], loglog=True, bins=np.logspace(0, 6, 100),
                           title="yooooooooo (loglog scale)");

Subplots

In [None]:
# Horizontal layout: one row, three panels
fig, axs = plt.subplots(1, 3, figsize=(12,3))

# NOTE(review): df.count() returns one count per column, so .reset_index().groupby("YEA") on it
# looks wrong — a groupby step before .count() was probably lost; confirm against the original solution.
tmp_a = df.count().reset_index().groupby("YEA").VOT.count()
# Line plus scatter on the same axes draws the curve and marks each data point.
axs[0].plot(tmp_a )
axs[0].scatter(tmp_a.index, tmp_a)
axs[0].set_title("a) # RfA")
axs[0].set_xlabel("Year")
print("a) # RfA 2008:", tmp_a.loc[2008])

# Mean of a boolean mask = fraction of rows where VOT > 0, per year.
tmp_b = df.groupby(["YEA"]).VOT.apply(lambda x: np.mean(x > 0))
axs[1].plot(tmp_b )
axs[1].scatter(tmp_b.index, tmp_b)
axs[1].set_title("b) % Positive votes")
axs[1].set_xlabel("Year")
print("b) % Positive votes:", tmp_b.loc[2008])

# NOTE(review): same df.count() concern as for tmp_a.
tmp_c = df.count().reset_index().groupby("YEA").VOT.mean()
axs[2].plot(tmp_c )
axs[2].scatter(tmp_c.index, tmp_c)
axs[2].set_title("c) Avg. # votes per RfA")
axs[2].set_xlabel("Year");

# Vertical layout: three stacked panels sharing the x axis
fig, axs = plt.subplots(3, sharex=True)
axs[0].plot(tmp_a)
axs[0].set_title("count_rfa_by_year")
axs[1].plot(tmp_b)
axs[1].set_title("frac_pos_votes_by_year")
axs[1].set_ylim([0, 1])   # a fraction, so fix the axis to [0, 1]
axs[2].plot(tmp_c)
axs[2].set_title("avg_vote_count_by_year_and_rfa")
plt.xlabel("Year")        # applies to the current (last created) axes
fig.tight_layout()

Complementary cumulative distribution function (CCDF)

In [None]:
# complementary=True plots 1 - ECDF, i.e. the share of nodes whose degree exceeds x.
sns.ecdfplot(list(dict(G.degree()).values()), complementary=True)
plt.xscale("log")
plt.axvline(10) # vertical reference line at x = 10
plt.axhline(0.4) # horizontal reference line at y = 0.4
plt.title("ComplementaryCDF")
# NOTE(review): the plotted values are raw degrees (G.degree()), not degree centrality.
plt.xlabel("Degree_centrality")

Gros petage de cable, matrice de subplot

In [None]:
# 2x2 grid: rows = in-/out-degree, columns = linear / log-log axes.
# in_degree_freq / out_degree_freq are lists where position d holds the number of nodes of degree d.
fig, axes = plt.subplots(2, 2, figsize = (15,10),gridspec_kw={'hspace': 0.4, 'wspace': 0.2})
fig.suptitle("In and out degree distribution of the Wikispeedia Network (Left: linear axes, Right: Log axes)", fontsize=20)
axes[0][0].plot(range(len(in_degree_freq)), in_degree_freq, 'go-', label='In-degree') # list
axes[0][1].loglog(range(len(in_degree_freq)), in_degree_freq, 'go-', label='In-degree')
axes[1][0].plot(range(len(out_degree_freq)), out_degree_freq, 'bo-', label='Out-degree')
axes[1][1].loglog(range(len(out_degree_freq)), out_degree_freq, 'bo-', label='Out-degree')
# Apply the same axis labels and legend to every panel of the grid.
for x in range(0,axes.shape[0]):
    for y in range(0,axes.shape[1]):
        axes[x,y].set_xlabel('Degree', fontsize = 20)
        axes[x,y].set_ylabel('Frequency', fontsize = 20)
        axes[x,y].legend(fontsize=15)
fig.subplots_adjust(top=0.94)   # leave room for the suptitle
plt.show()

### 3. Machine learning

Split dataframe

In [None]:
def split_set(data_to_split, ratio=0.8, seed=None):
    """Randomly split a DataFrame (or Series) into two parts.

    Every row is assigned independently to the first part with probability
    `ratio`, so the realised split sizes vary around `ratio` from call to call.

    Args:
        data_to_split: pandas object that supports boolean-mask indexing.
        ratio: Probability that a row lands in the first part.
        seed: Optional integer seed for a reproducible split. When None (the
            default) the global NumPy random state is used, exactly as before.

    Returns:
        A two-element list `[first_part, second_part]`, each with a fresh
        0..n-1 index.
    """
    # A private RandomState keeps a seeded split from disturbing the global random stream.
    rng = np.random if seed is None else np.random.RandomState(seed)
    mask = rng.rand(len(data_to_split)) < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]
[train, test] = split_set(df)

In [None]:
from sklearn.model_selection import train_test_split

# FIRST OPTION: separate the features from the target, then split both consistently
X = df.drop(columns="view_count")
y = df["view_count"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

# SECOND OPTION: split the whole frame; pick the feature/target columns afterwards
train, test = train_test_split(df, test_size=0.3, random_state=42)

# THIRD OPTION: create numpy arrays
# (this overwrites X, y and the four split variables of the first option)
X = top5_articles_content['content'].to_numpy()
y = top5_articles_content['labels'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

ROC/AUC score

In [None]:
from sklearn.metrics import roc_auc_score

# Rule-based prediction: 1 when PP + NN >= PN, else 0.
y_pred = (df.PP + df.NN >= df.PN).astype(int) # convert boolean to int
# With hard 0/1 predictions the ROC curve has a single operating point.
print("A) roc score", roc_auc_score(y, y_pred))

Ridge regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 3-fold grid search over the regularisation strength alpha.
ridge = Ridge()
ridge_hyper = {'alpha':(0.001, 0.01, 0.1)}
ridge_cv = GridSearchCV(ridge, ridge_hyper, cv=3)
ridge_cv.fit(X_train, y_train)

# Mean cross-validated score per alpha, in the order of ridge_hyper.
# Only the last expression of a cell is displayed, so this line shows nothing when the whole cell runs.
ridge_cv.cv_results_['mean_test_score']

# Test-set error of the grid search's predictions.
mean_absolute_error(y_test, ridge_cv.predict(X_test))

TF-IDF transformation + SGDClassifier

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# TF-IDF features restricted to the 150 most frequent terms, English stop words removed.
vectorizer = TfidfVectorizer( max_features=150, stop_words="english")
X = vectorizer.fit_transform(df.TXT_PROCESSED.values).toarray()
# X2 = TF-IDF features plus five extra numeric columns.
# NOTE(review): X2 is built but the split below uses X — pass X2 to include the extra features.
X2 = np.hstack((X, df[["PP", "NN", "PN", "N", "P"]].values))
y =  (df.VOT == 1).values.astype(int)   # binary target: 1 when VOT == 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# NOTE(review): `loss_v` is not defined in this cell; predict_proba below needs a probabilistic
# loss (logistic or modified Huber) — confirm the value used.
clf = SGDClassifier(random_state=0, loss=loss_v).fit(X_train, y_train)
y_pred = clf.predict(X_test)                # hard labels ...
y_pred = clf.predict_proba(X_test)[:,1]     # ... immediately overwritten by P(class 1), which is what ROC AUC needs
print("(with predict_proba)", roc_auc_score(y_test, y_pred))

TF-IDF matrix + MultiClass Classifier with cross validation CV

In [None]:
# Grid for GridSearchCV: keys are "<pipeline step name>__<parameter name>".
parameters = {
    'clf__alpha': [1e-4],
}

text_clf = Pipeline([                   # utility that helps automate machine learning workflows
    ('vect', CountVectorizer()),        # converts a collection of text documents to a matrix of token counts
    ('tfidf', TfidfTransformer()),      # transforms the count matrix from CountVectorizer to a normalized tf-idf representation
    # NOTE(review): loss='log' is the older spelling of the logistic loss; newer scikit-learn
    # releases call it 'log_loss' — confirm against the installed version.
    ('clf', SGDClassifier(penalty='l2', loss='log', max_iter=5, tol=None, random_state=42)) # class_weight='balanced'
])                                      # linear classifier (SVM, logistic regression, etc.) with SGD training

# GridSearchCV is applied to the pipeline with the defined parameters.
# It will conduct a grid search over the parameter space using 5-fold cross-validation
gs_clf = GridSearchCV(text_clf, parameters, cv=5)
gs_clf = gs_clf.fit(X_train, y_train)   # fits the grid search model to the training data

# best score achieved across all parameter combinations in the grid search
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

# grid search object gs_clf is used to make predictions on the test data
predicted = gs_clf.predict(X_test)
print("Accuracy on Test Data: ", np.mean(predicted == y_test))

# In a 5-class classification a random model obtains an accuracy of 20% in expectation.
# Thus, obtaining an accuracy in high 80s is a strong outcome.

# Top 10 words per class, in descending order of the classifier's feature weights:
# argsort is ascending, so take the last 10 columns of each row and reverse them.
# (The variable is called top5_* but holds 10 indices per class.)
top5_coeff_indices = np.argsort(gs_clf.best_estimator_.named_steps['clf'].coef_)[:,-10:][:,::-1]
# NOTE(review): newer scikit-learn releases expose get_feature_names_out() instead of
# get_feature_names() — confirm against the installed version.
np.array(gs_clf.best_estimator_.named_steps['vect'].get_feature_names())[top5_coeff_indices]

Metrics of accuracy and scores

In [None]:
# serie Applied ML
def compute_confusion_matrix(true_label, prediction_proba, decision_threshold=0.5):
    """Build a 2x2 confusion matrix from class probabilities.

    A sample is predicted positive when the second column of
    `prediction_proba` is strictly greater than `decision_threshold`.

    Args:
        true_label: Array of ground-truth labels (0 or 1).
        prediction_proba: 2-D array; column 1 is read as the positive-class score.
        decision_threshold: Cut-off applied to column 1.

    Returns:
        np.ndarray laid out as [[TP, FP], [FN, TN]].
    """
    predict_label = (prediction_proba[:, 1] > decision_threshold).astype(int)

    def tally(predicted_value, actual_value):
        # Number of samples with this (predicted, actual) combination.
        return np.sum(np.logical_and(predict_label == predicted_value,
                                     true_label == actual_value))

    # Rows: predicted positive / predicted negative; columns: actually positive / actually negative.
    return np.asarray([[tally(1, 1), tally(1, 0)],
                       [tally(0, 1), tally(0, 0)]])

def compute_confusion_mat(true_label, predict_label):
    """Build a 2x2 confusion matrix from hard 0/1 predictions.

    Args:
        true_label: Array of ground-truth labels (0 or 1).
        predict_label: Array of predicted labels (0 or 1).

    Returns:
        np.ndarray laid out as [[TP, FP], [FN, TN]].
    """
    said_yes, said_no = predict_label == 1, predict_label == 0
    is_yes, is_no = true_label == 1, true_label == 0

    # Each cell pairs a prediction mask with a ground-truth mask.
    layout = [[(said_yes, is_yes), (said_yes, is_no)],
              [(said_no, is_yes), (said_no, is_no)]]
    return np.asarray([[np.sum(np.logical_and(pred_mask, truth_mask))
                        for pred_mask, truth_mask in row]
                       for row in layout])

def compute_all_score(confusion_matrix, t=0.5):
    """Derive accuracy, precision, recall and F1 for both classes.

    Args:
        confusion_matrix: 2x2 array laid out as [[TP, FP], [FN, TN]].
        t: Decision threshold the matrix was built with; echoed back as the
            first element of the result and not used in any computation.

    Returns:
        List `[t, accuracy, precision_pos, recall_pos, F1_pos,
        precision_neg, recall_neg, F1_neg]`. A score whose denominator is
        zero is reported as NaN.
    """
    (TP, FP), (FN, TN) = confusion_matrix.astype(float)

    def ratio(numerator, denominator):
        # Guarded division: NaN instead of a division by zero.
        if denominator != 0:
            return numerator / denominator
        return np.nan

    def f1(precision, recall):
        # Harmonic mean of precision and recall.
        return ratio(2 * (precision * recall), precision + recall)

    accuracy = (TP + TN) / np.sum(confusion_matrix)

    precision_positive = ratio(TP, TP + FP)
    recall_positive = ratio(TP, TP + FN)
    precision_negative = ratio(TN, TN + FN)
    recall_negative = ratio(TN, TN + FP)

    return [
        t,
        accuracy,
        precision_positive,
        recall_positive,
        f1(precision_positive, recall_positive),
        precision_negative,
        recall_negative,
        f1(precision_negative, recall_negative),
    ]


from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, balanced_accuracy_score

# Only the last expression of a cell is displayed; print the others or split the cell to see every result.
confusion_matrix(y_test_small, predicted_small)                 # Compute confusion matrix to evaluate the accuracy of a classification.
balanced_accuracy_score(y_test_small, predicted_small)          # Balanced accuracy for binary and multiclass problems with imbalanced datasets: the average of the recall obtained on each class.
                                                                # The best value is 1 and the worst value is 0
classification_report(y_test_small, predicted_small, digits=3)  # Build a text report showing the main classification metrics (wrap in print() for readable line breaks)

Logistic regression with cross Validation CV

In [None]:
# Candidate inverse regularisation strengths, evaluated with 3-fold cross-validation.
Cs = (1, 10, 100)
log_reg_cv = LogisticRegressionCV(Cs=Cs, cv=3, random_state=42, max_iter=200)

log_reg_cv.fit(X_train, y_train_binary)
opt_C = log_reg_cv.C_[0]   # selected C (first entry of the per-class array)
opt_C

# Mean cross-validated score for each candidate C (averaged over the folds).
# NOTE(review): the key 1 assumes the positive class label is 1 — confirm.
log_reg_cv.scores_[1].mean(axis=0)
log_reg_cv.score(X_test, y_test_binary)   # score of the refitted model on the test set

Logistic regression classifier using sklearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features: a TF-IDF matrix built beforehand; target: the 'gender' column.
X = TFIDF_questions
y = df["gender"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=42)

# C is the inverse regularisation strength; max_iter is raised so the solver can converge.
log_reg = LogisticRegression(C=10, random_state=42, max_iter=2000)

log_reg.fit(X_train, y_train)
# Predict with the estimator that was just fitted (the cell previously referred to an
# undefined `model` and printed an undefined label `s`).
y_pred = log_reg.predict(X_test)
print("accuracy", accuracy_score(y_test, y_pred))

Logistic regression using statsmodels

In [None]:
import statsmodels.formula.api as smf

# Binary target for the logit model: 1 when VOT == 1, else 0.
df["VOT2"] = (df.VOT == 1).values.astype(int)
# Formula API: target on the left of ~, predictors on the right; summary() reports coefficients and p-values.
smf.logit("VOT2 ~ PP + NN +PN" , data=df).fit().summary()

Linear regression 

In [None]:
import statsmodels.formula.api as smf

# OLS of `time` on high blood pressure, DEATH_EVENT (reference level 0), their interaction, and diabetes.
mod = smf.ols(formula='time ~ C(high_blood_pressure) * C(DEATH_EVENT,  Treatment(reference=0)) + C(diabetes)',
              data=df)
res = mod.fit()
print(res.summary())

# PREPARE THE MODEL
# - Equations are specified using patsy formula syntax. Important operators are:
#     1. `~` : Separates the left-hand side and right-hand side of a formula.
#     2. `+` : Creates a union of terms that are included in the model.
#     3. `:` : Interaction term.
#     4. `*` : `a * b` is short-hand for `a + b + a:b`, and is useful for the common case of wanting to include all interactions between a set of variables.
# - Intercepts are added by default.
# - Categorical variables can be included directly by adding a term C(a).

# ANALYSE THE SUMMARY
# - The dependent variable : time (number of days at the hospital)
# - Method: The type of model that was fitted (OLS)
# - Nb observations: The number of datapoints (299 patients)
# - R2: The fraction of explained variance
# - A list of predictors
# - For each predictor: coefficient, standard error of the coefficients, p-value, 95% confidence intervals. We can see that only high blood pressure is a significant predictor (p = 0.001), while diabetes is not (0.584).
# - Warnings if there are numerical issues (hopefully not!)

# C(diabetes)[T.1]: This coefficient represents the change in the dependent variable for individuals with diabetes (1), compared to those without (0), all else being equal.

# If P>|t| > 0.05, the coefficient is NOT statistically significant: we cannot reject the null hypothesis that it is zero.
# For a True/False predictor, read the coefficient as the change in the outcome when the predictor switches from False to True.

### 3Bis. Unsupervised learning : K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

### How to select K in K-Means?  You have a couple of options:

# - Silhouette score: Find the K with the desired tradeoff between the number of clusters and cohesion/separation.
silhouettes = []

# Try multiple k (the silhouette score needs at least 2 clusters, hence the range starts at 2)
for k in range(2, 11):
    # Cluster the data and assign the labels
    labels = KMeans(n_clusters=k, random_state=10).fit_predict(X)
    # Get the Silhouette score
    score = silhouette_score(X, labels)
    silhouettes.append({"k": k, "score": score})

# Convert the list of records to a dataframe (the name is reused for the frame)
silhouettes = pd.DataFrame(silhouettes)

# Plot the score against K
plt.plot(silhouettes.k, silhouettes.score)
plt.xlabel("K")
plt.ylabel("Silhouette score")

# - Elbow method: Find the "elbow" in the curve of the Sum of Squared Errors
def plot_sse(features_X, start=2, end=11):
    """Plot the K-Means sum of squared errors for every K in [start, end).

    One K-Means model (random_state=10) is fitted per K and its `inertia_`
    is drawn against K on the current matplotlib axes, so the "elbow" can be
    read off the curve.

    Args:
        features_X: Feature matrix passed to `KMeans.fit`.
        start: First number of clusters to try (inclusive).
        end: Upper bound on the number of clusters (exclusive).
    """
    # One record per K: the fitted model's inertia is its sum of squared errors.
    records = [
        {"k": n_clusters,
         "sse": KMeans(n_clusters=n_clusters, random_state=10).fit(features_X).inertia_}
        for n_clusters in range(start, end)
    ]
    curve = pd.DataFrame(records)

    # Draw the elbow curve.
    plt.plot(curve.k, curve.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")
    
plot_sse(X)

# Plot the results
# With a 1x1 grid, plt.subplots returns a single Axes object (not an array).
fig, axs = plt.subplots(1, 1, figsize=(4,4), sharey=True)

# Plot the clusters with K = 3, one colour per cluster label
# (only the first two feature columns are drawn).
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
axs.scatter(X[:,0], X[:,1], c=labels, alpha=0.6)

# See notebook for reduced PCA or t-SNE, and DBSCAN

### 4. Networks

In [1]:
# All graph types (Graph, DiGraph, MultiGraph, MultiDiGraph) and their methods:
# https://networkx.org/documentation/stable/reference/classes/index.html 

Graph initialization

In [None]:
# Check the type of graph !! https://networkx.org/documentation/stable/reference/classes/index.html
# Build the graph from an edge table: one edge per row, endpoints taken from the two named columns.
G =nx.from_pandas_edgelist(edges, 'ColSource', 'ColTarget', edge_attr=None, create_using= nx.Graph())
# Attach a node attribute from a {node: value} dict (df must be indexed by node id).
nx.set_node_attributes(G, df['Role'].to_dict(), 'Role' )
# OR attach a computed metric as a node attribute
betweenness = nx.betweenness_centrality(G)
nx.set_node_attributes(G, betweenness, 'betweenness')

# Think of in_degree and out_degree for DiGraph and MultiDiGraph!
# NOTE(review): these two views exist only on directed graphs; with create_using=nx.Graph()
# above they are not available — use nx.DiGraph() when running the lines below.
tmp = sorted(dict(G.out_degree()).values())
in_degree = pd.Series(dict(G.in_degree()))

In [None]:
# Build a directed multigraph from a node table and an edge table.
G = nx.MultiDiGraph()
edge_list = pd.read_csv("./data/part-1/edgelist.tsv", sep="\t")
node_list = pd.read_csv("./data/part-1/nodelist.tsv", sep="\t")

# Create the nodes with their attributes
for _, node in node_list.iterrows():
    node = dict(node)
    G.add_node(node['u'], score=node['score'], name=node['name'])

# Create the edges with their attributes (parallel edges are kept, each under its own key)
for _, edge in edge_list.iterrows():
    edge = dict(edge)
    G.add_edge(edge['u'], edge['v'], gender=edge['gender'])

# Iterating over the edges of a Multi(Di)Graph yields (u, v, key) triples.
for u, v, k in G.edges:
    # do what you want to do
    pass  # a loop body must contain at least one statement

Graph description

In [None]:
def describe_graph(G):
    """Print summary statistics of an undirected graph.

    Prints the graph itself, node and edge counts, then (only when the graph
    is connected) the average shortest path length and the diameter, and
    finally the density and the transitivity.

    Args:
        G: Undirected networkx graph.
    """
    print(G)
    print(f"There are {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    # Path-based metrics are only computed for a connected graph.
    if nx.is_connected(G):
        print("Avg. Shortest Path Length: %.4f" %nx.average_shortest_path_length(G))
        print("Diameter: %.4f" %nx.diameter(G)) # Longest shortest path
    else:
        print("Graph is not connected")
        print("Diameter and Avg shortest path length are not defined!")
    # The value printed under the label "Sparsity" is the graph density.
    print("Sparsity: %.4f" %nx.density(G))  # #edges/#edges-complete-graph
    # #closed-triplets(3*#triangles)/#all-triplets
    print("Global clustering coefficient aka Transitivity: %.4f" %nx.transitivity(G))

def describe_digraph(G):
    """Print summary statistics of a directed graph.

    Prints the graph itself, node and edge counts, the average in- and
    out-degree, then (only when the graph is strongly connected) the average
    shortest path length and the diameter, and finally the density and the
    transitivity.

    Args:
        G: Directed networkx graph.
    """
    print(G)
    print(f"There are {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    # Average degree = sum of the per-node degrees divided by the number of nodes.
    print(f"Average in-degree: {sum(d for n, d in G.in_degree()) / float(G.number_of_nodes())}")
    print(f"Average out-degree: {sum(d for n, d in G.out_degree()) / float(G.number_of_nodes())}")

    # Path-based metrics are only computed for a strongly connected graph.
    if nx.is_strongly_connected(G):
        print("Avg. Shortest Path Length: %.4f" % nx.average_shortest_path_length(G))
        print("Diameter: %.4f" % nx.diameter(G))  # Longest shortest path
    else:
        print("DiGraph is not strongly connected")
        print("Diameter and Avg shortest path length are not defined!")

    # The value printed under the label "Sparsity" is the graph density.
    print("Sparsity: %.4f" % nx.density(G))
    print("Global clustering coefficient aka Transitivity: %.4f" % nx.transitivity(G))

def describe_multigraph(G):
    """Print summary statistics of an undirected multigraph.

    Same report as `describe_graph`: node and edge counts, path metrics when
    the graph is connected, then density and transitivity.

    Args:
        G: Undirected networkx multigraph.
    """
    print(G)
    print(f"There are {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    # Path-based metrics are only computed for a connected graph.
    if nx.is_connected(G):
        print("Avg. Shortest Path Length: %.4f" % nx.average_shortest_path_length(G))
        print("Diameter: %.4f" % nx.diameter(G))  # Longest shortest path
    else:
        print("MultiGraph is not connected")
        print("Diameter and Avg shortest path length are not defined!")

    # The value printed under the label "Sparsity" is the graph density.
    print("Sparsity: %.4f" % nx.density(G))
    # NOTE(review): networkx's triangle-based clustering measures may not be implemented for
    # multigraph types — confirm this call runs on a MultiGraph (else convert with nx.Graph(G) first).
    print("Global clustering coefficient aka Transitivity: %.4f" % nx.transitivity(G))

def describe_multidigraph(G):
    """Print summary statistics of a directed multigraph.

    Prints the graph itself, node and edge counts, the average in- and
    out-degree, then (only when the graph is strongly connected) the average
    shortest path length and the diameter. Unlike the other describe_*
    helpers it prints neither density nor transitivity.

    Args:
        G: Directed networkx multigraph.
    """
    print(G)
    # G.size() with no arguments is the number of edges.
    print(f"There are {G.number_of_nodes()} nodes and {G.size()} edges.")
    print("Avg In-Degree: %.4f" % (sum(d for n, d in G.in_degree()) / float(G.number_of_nodes())))
    print("Avg Out-Degree: %.4f" % (sum(d for n, d in G.out_degree()) / float(G.number_of_nodes())))

    # Checking for strong connectivity
    if nx.is_strongly_connected(G):
        print("Avg. Shortest Path Length: %.4f" % nx.average_shortest_path_length(G))
        print("Diameter: %.4f" % nx.diameter(G))  # Longest shortest path in a strongly connected component
    else:
        print("MultiDiGraph is not strongly connected")
        print("Diameter and Avg shortest path length are not defined for the whole graph!")


Subgraph view with edge

In [None]:
# Get all edges with a specific attribute (filter on value), store their keys, and then create a subgraph with them
edges_2004 = [i for i, v in nx.get_edge_attributes(G, "YEA").items() if v == 2004]
G_2004 = G.edge_subgraph(edges_2004)

# Collect all triangles (cliques of exactly 3 nodes).
# enumerate_all_cliques yields cliques in order of increasing size, so the loop skips
# sizes 1-2 and can stop at the first clique larger than 3.
idx = 0    # number of triangles found
tmp = []   # the triangles themselves
for i in nx.enumerate_all_cliques(G_2004):
    if len(i) < 3:
        continue
    if len(i) > 3:
        break
    idx += 1
    tmp.append(i)

# Alternative: build the edge list from the dataframe instead of from the edge attributes
df_2004 = df[df['YEA']==2004]
edges_of_2004 = [(x['SRC'], x['TGT']) for i, x in df_2004.iterrows()] # list of tuples, i.e. list of edges
H = nx.edge_subgraph(G, edges_of_2004)
cliques = nx.enumerate_all_cliques(H)
# Scans every clique (no early stop), keeping those with exactly 3 nodes.
triangles = list(filter(lambda x: len(x)==3, cliques))

In [None]:
# Get indegree and outdegree distributions
# Counts distinct predecessors / successors per node (requires a directed graph).
indegree = []
outdegree = []
for node in G.nodes:
    indegree.append(len(list(G.predecessors(node))))
    outdegree.append(len(list(G.successors(node))))
indegree = np.array(indegree)
outdegree = np.array(outdegree)

# Normalise to shares, sort from largest to smallest and accumulate: entry i is the
# fraction of all in-/out-links held by the i+1 highest-degree nodes.
indegree = np.array(sorted(indegree/sum(indegree), reverse=True)).cumsum()
outdegree = np.array(sorted(outdegree/sum(outdegree), reverse=True)).cumsum()
# See also sns.ecdfplot

Get the number of connected components and describe them

In [None]:
# Method 1: just the count (undirected graphs)
nx.number_connected_components(G)

# Method 2: materialise the components (each one is a set of nodes)
comp = list(nx.connected_components(G))
print('The graph contains', len(comp), 'connected components')

# Method 3
[len(c) for c in sorted(nx.connected_components(G), key=len, reverse=True)] # list the sizes of all connected components in G from the largest to smallest

# Directed graphs: use the strongly / weakly connected variants instead
# Graph.subgraph(nodes) returns a SubGraph view of the subgraph induced on nodes.
print(nx.is_strongly_connected(G))
print(nx.number_strongly_connected_components(G))
components = [c for c in sorted(nx.strongly_connected_components(G), key=len, reverse=True)]
print(len(components[0]))   # size of the largest strongly connected component

# Number of nodes and edges in the largest weakly connected component, and its edges-per-node ratio
largest_cc = max(nx.weakly_connected_components(G), key=len)
H = G.subgraph(list(largest_cc))
print(len(H.nodes()), len(H.edges()), len(H.edges())/len(H.nodes()))

Shortest path between two nodes

In [None]:
# Returns the list of nodes on one shortest path, source and target included.
fell_whitehead_path = nx.shortest_path(G, source="Margaret Fell", target="George Whitehead")

Compute and print betweenness centrality

In [None]:
# Betweenness centrality per node, also stored on the graph as a node attribute.
betweenness = nx.betweenness_centrality(G)
nx.set_node_attributes(G, betweenness, 'betweenness')
# Sort the (node, score) pairs by score, highest first. A lambda key is used because
# `operator.itemgetter` is not imported in this notebook.
sorted_betweenness = sorted(betweenness.items(), key=lambda item: item[1], reverse=True)

# Report the two most central nodes.
for index, bw in sorted_betweenness[:2]:
    channel_name = G.nodes[index]['channel']
    print(index, 'who is', channel_name, 'in category',  G.nodes[index]['category'], 'has betweeness: %.3f' %bw)

Histogram of in_degree/out_degree of a DiGraph

In [None]:
def degree_histogram_directed(G, in_degree=False, out_degree=False):
    """Return the degree histogram of a graph as a list of frequencies.

    Args:
        G: Graph exposing `nodes()`, `degree()` and, for the directed
            variants, `in_degree()` / `out_degree()`.
        in_degree: When True, use in-degrees.
        out_degree: When True (and `in_degree` is False), use out-degrees.
            When both flags are False the total degree is used.

    Returns:
        A list `freq` where `freq[d]` is the number of nodes of degree `d`,
        for d = 0 .. max degree. An empty graph yields an empty list.
    """
    if in_degree:
        degree_of = dict(G.in_degree())
        degseq = [degree_of.get(node, 0) for node in G.nodes()]
    elif out_degree:
        degree_of = dict(G.out_degree())
        degseq = [degree_of.get(node, 0) for node in G.nodes()]
    else:
        degseq = [degree for _, degree in G.degree()]

    # max() of an empty sequence raises, so handle the empty graph explicitly.
    if not degseq:
        return []

    freq = [0] * (max(degseq) + 1)
    for degree in degseq:
        freq[degree] += 1
    return freq

in_degree_freq = degree_histogram_directed(G, in_degree=True)
out_degree_freq = degree_histogram_directed(G, out_degree=True)
degrees = range(len(in_degree_freq))
# see the petage de cable

Build a dataframe from a graph

In [None]:
# For a multigraph, get_edge_attributes is keyed by (u, v, key).
gender_dict = nx.get_edge_attributes(G, "gender")
# Collect one record per edge, then build the frame once at the end.
# `df` is first a list of records and is rebound to the DataFrame below.
# NOTE(review): `scores_dict` is not defined in this cell — presumably {node: score}; confirm.
df = []
for u, v, k in G.edges: #multigraph
    df.append(
    {
        "gender": gender_dict[(u,v,k)] == "F",   # boolean: True when the edge's gender is "F"
        "d": scores_dict[v] - scores_dict[u],    # score difference, target minus source
        "q": scores_dict[u],                     # score of the source node
    }
    )
df = pd.DataFrame(df)

### 5. Statistics

Bootstrap

In [None]:
from sklearn.metrics import roc_auc_score

# 1e methode
def bootstrap95(df):
    """Bootstrap a 95% interval for the difference in ROC AUC of two classifiers.

    Draws 200 resamples of `df` (same size, with replacement). On each
    resample it computes the ROC AUC of column 'SBT_clas' minus that of
    column 'weak_SBT_clas', both against column 'VOT'. The 2.5% and 97.5%
    quantiles of these differences are printed and returned.

    Args:
        df: DataFrame with columns 'VOT', 'SBT_clas' and 'weak_SBT_clas'.

    Returns:
        Array of two values: the lower and upper bound of the interval.
    """
    auc_gaps = []
    for _ in range(200):
        resample = df.sample(frac = 1, replace = True)

        truth = resample['VOT']
        strong_auc = roc_auc_score(truth, resample['SBT_clas'])
        weak_auc = roc_auc_score(truth, resample['weak_SBT_clas'])

        gap = strong_auc - weak_auc
        auc_gaps.append(gap.mean())

    bounds = np.quantile(np.array(auc_gaps), q=[0.025, 0.975])
    print("95% CI:", bounds)
    return bounds

CI_roc_dif = bootstrap95(df)

# 2e methode
def do_bootstrap(data, n=1000): # 95%
    """Bootstrap a 95% interval for the mean of `data`.

    Args:
        data: 1-D array-like of observations.
        n: Number of bootstrap resamples.

    Returns:
        Tuple `(lower, upper)`: the 2.5th and 97.5th percentiles of the
        resampled means.
    """
    # Each resample has the same size as the data and is drawn with replacement.
    resampled_means = [
        np.mean(np.random.choice(data, size=len(data)))
        for _ in range(n)
    ]
    return tuple(np.percentile(resampled_means, q) for q in (2.5, 97.5))

do_bootstrap(howto_perchannel_new)

# no overlap to have significant one better than another !

Statistical tests

In [None]:
import scipy.stats as stats

# Despite the names, these hold the per-group samples of 'score_gain', not their means.
mean_F = df[df['gender']=='F']['score_gain']
mean_M = df[df['gender']=='M']['score_gain']
# Independent two-sample t-test; returns the statistic and the p-value.
stats.ttest_ind(mean_F, mean_M)

stats.spearmanr(df.similarity, df.ranking) # rank correlation: returns (correlation, p-value)
# If pvalue > 0.05, the correlation is not statistically significant

### 6. Analysis

Networkx

Average degree is not recommended as a summary, because the degree distribution of real-world networks usually follows a power law. Summarizing a power law with its mean is misleading: the distribution has a long tail in which a few nodes have a very high degree, and those nodes pull the mean away from the degree of a typical node. The median is a better choice.

TF-IDF regularization

We are performing classification tasks on sparse matrices where the number of features outnumbers the number of data points, so a lack of regularization can lead to overfitting. C is the inverse of the regularization strength: when we increase C we decrease the regularization penalty and thus increase the risk of overfitting.

### 7. Spark

In [None]:
import pyspark
# The column functions used below live in pyspark.sql.functions and must be imported explicitly.
from pyspark.sql.functions import avg, col, count, countDistinct, length, stddev

# NOTE(review): `spark` (the SparkSession) is not created in this cell — it is assumed to be
# provided by the environment or by an earlier cell; confirm.
reddit = spark.read.json("messages.json.gz")
reddit.printSchema()

# Process a pyspark dataframe: one row per subreddit with several aggregates.
# Every aggregate gets its own alias (two columns cannot both be called "posts_length"),
# and the single .agg( call is only closed after the last aggregate.
subreddit_info = reddit.groupBy("subreddit")\
    .agg(count("*").alias("total_posts"),
         countDistinct("author").alias("users_count"),
         avg(length("body")).alias("posts_length"), # average length of the body string
         avg("score").alias("score_avg"),           # average of a numeric column
         stddev(length("body")).alias("posts_length_stddev")
        ).cache()

# Convert from a PySpark dataframe to a pandas dataframe
# (sort and limit in Spark first so that only the needed rows are collected)
by_posts = subreddit_info.select("subreddit", "total_posts")\
    .sort(col("total_posts").desc())\
        .limit(50000)\
            .toPandas()

# ex 3: collect everything, then sort in pandas
subreddits_by_pl = subreddit_info.toPandas()\
    .sort_values("posts_length", ascending=False)\
        .reset_index(drop=True)

# Convert the pandas DataFrame to a PySpark DataFrame
spark_df = spark.createDataFrame(pandas_df)

# From a Spark dataframe to an RDD: group the words of each subreddit into one list
subreddit_50k = filtered_tokens.rdd.map(lambda r: (r.subreddit, [r.word])).reduceByKey(lambda a,b: a+b).collect()

### 8. Text

Clean text

In [None]:
def clean_line(line):
    """Replace every entry of EXCLUDE_CHARS in `line` with a space and lower-case the result.

    Args:
        line: Text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # NOTE(review): EXCLUDE_CHARS is defined outside this cell — presumably an iterable of
    # characters/substrings to strip; confirm.
    for char in EXCLUDE_CHARS:
        line = line.replace(char, ' ')
    return line.lower()

# Clean the column in place (element-wise), then preview the first rows.
df["Line"] = df["Line"].apply(clean_line)
df.head()

Count nb of tokens

In [None]:
# split() with no argument splits on any run of whitespace, so this counts whitespace-separated tokens.
lines["Words"] = lines["Line"].apply(lambda x: len(x.split()))

### 9. Matching

In [None]:
# One feature is still very unbalanced between the groups after plain propensity matching,
# so we force it to be balanced: a pair is only a candidate when both rows agree on that feature.

def _propensity(row):
    """Return the model's predicted score for a single row as a plain scalar."""
    # Going through np.asarray avoids index alignment between predictions of different rows,
    # which would otherwise turn their difference into NaN.
    return np.asarray(res.predict(pd.DataFrame([row])))[0]

# Score each treated row once instead of once per (control, treated) pair.
treated_scores = {treat_idx: _propensity(treat_row) for treat_idx, treat_row in df_treat.iterrows()}

G = nx.Graph()
for control_idx, control_row in df_notreat.iterrows():
    control_score = _propensity(control_row)
    for treat_idx, treat_row in df_treat.iterrows():

        # Similarity is high when the scores are close; max_weight_matching MAXIMISES the total
        # weight, so the weight must be a similarity and not a distance.
        similarity = 1 - abs(control_score - treated_scores[treat_idx])

        # NOTE(review): feature1 / feature2 are placeholders for the feature to balance on,
        # e.g. control_row['<feature>'] == treat_row['<feature>'] — fill in the real column.
        if feature1 == feature2:
            # Nodes are the row labels (hashable); assumes the two frames have disjoint indexes.
            G.add_weighted_edges_from([(control_idx, treat_idx, similarity)])

# A matching is a subset of edges in which no node occurs more than once. The weight of a matching is the sum of the weights of its edges. A maximal matching cannot add more edges and still be a matching. The cardinality of a matching is the number of matched edges.
matching = nx.max_weight_matching(G)

In [None]:
G_obs = nx.Graph()
# Distinct (source, shortest_path_length) combinations: the two features matched on exactly.
vs = set(list(zip(paths['source'].values, paths['shortest_path_length'].values)))
match_set = set()
max_counter = len(vs)
counter = 0

# Iterate over each combination of the two features we want to match on
for source, min_dist in vs:
    counter += 1
    # Progress report: fraction of combinations processed, printed every 1000 iterations.
    if counter % 1000 == 0:
        print(counter/max_counter)

    # Rows with this combination whose target has a high in-degree ...
    high_indegree = paths[(paths['in_degree_binary_target'] == True) &
                     (paths['source'] == source) &
                     (paths['shortest_path_length'] == min_dist)]

    # ... and rows with the same combination whose target has a low in-degree.
    low_indegree = paths[(paths['in_degree_binary_target'] == False) &
                       (paths['source'] == source) &
                       (paths['shortest_path_length'] == min_dist)]

    # Every (high, low) pair with different targets is a candidate match; nodes are row indexes of `paths`.
    for i, f in zip(high_indegree.index, high_indegree['target']):
        for j, u in zip(low_indegree.index, low_indegree['target']):
            if f != u:
                G_obs.add_edge(i, j)
                match_set.add((i,j))

# A matching is a subset of edges in which no node occurs more than once. A maximal matching cannot add more edges and still be a matching.
matching = nx.maximal_matching(G_obs)
print(f'#Matched pairs: {len(matching)}')

# For every row, collect its candidate partners: high -> [low, ...] and low -> [high, ...]
high_games_match_cands = {}; low_games_match_cands = {}
for (u,v) in match_set:
    if u not in high_games_match_cands:
        high_games_match_cands[u] = [v]
    else:
        high_games_match_cands[u].append(v)
    if v not in low_games_match_cands:
        low_games_match_cands[v] = [u]
    else:
        low_games_match_cands[v].append(u)

# Number of rows on each side that have at least one candidate partner
print(len(high_games_match_cands), len(low_games_match_cands))
set_low_games_match_cands = set(low_games_match_cands.keys())
set_high_games_match_cands = set(high_games_match_cands.keys())