In [11]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Data Loading and Initial Exploration

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

def load_data(dataset_query_path, search_results_paths):
    """
    Loads the query dataset and left-joins it with the stacked search results.

    Parameters:
    - dataset_query_path: The file path to the CSV containing the queries and the correct URLs.
      It must contain an 'EntityNumber' column. An 'Unnamed: 0' column is dropped when present.
    - search_results_paths: A list of file paths to the CSVs containing search results.
      The stacked search results must contain an 'EntityNumber' column. The list may be empty.

    Returns:
    - A DataFrame holding every row of the query dataset, joined on 'EntityNumber' with the
      concatenated search results (left join, so entities without search results get NaN in
      the search-result columns). When no paths are given, the query dataset is returned alone.
    """
    dataset_incl_query = pd.read_csv(dataset_query_path)
    # 'Unnamed: 0' is presumably a leftover index column from an earlier to_csv() call;
    # errors='ignore' keeps files that do not have it loadable instead of raising KeyError.
    dataset_incl_query = dataset_incl_query.drop(columns=['Unnamed: 0'], errors='ignore')

    # Read everything first and concatenate once instead of growing a frame inside the loop.
    search_frames = [pd.read_csv(path) for path in search_results_paths]
    if not search_frames:
        # Nothing to join against: merging with a column-less frame would raise a KeyError.
        return dataset_incl_query

    # Row-less files contribute nothing; keep one frame anyway so the join key column exists.
    non_empty_frames = [frame for frame in search_frames if not frame.empty]
    frames_to_stack = non_empty_frames or search_frames[-1:]
    merged_search_results = pd.concat(frames_to_stack, axis=0, ignore_index=True)

    return pd.merge(dataset_incl_query, merged_search_results, on='EntityNumber', how='left')

# -----------------------------------------------------------------------------------------------
# Path to the CSV containing the queries and the correct URLs
dataset_query_path = 'dataset_incl_query.csv'

# Paths to the search result CSVs that are joined onto the query dataset
search_results_paths = [
    'search_results_DDG.csv'
    # The merged dataset is created for each search dataset separately; otherwise duplicates appear
]
# merged_dataset is the notebook-wide DataFrame that the following cells read and extend
merged_dataset = load_data(dataset_query_path, search_results_paths)

# Quick look at the first rows of the merged result
print(merged_dataset.head())


   EntityNumber                                       OfficialName  ZipCode  \
0  0201.310.929                                                IGL     3600   
1  0202.239.951                                           PROXIMUS     1030   
2  0203.201.340                          Nationale Bank van België     1000   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...     9100   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...     1210   

          Municipality                Street HouseNumber  \
0                 Genk            Klotstraat         125   
1           Schaarbeek  Koning AlbertII laan          27   
2              Brussel     de Berlaimontlaan          14   
3         Sint-Niklaas             Lamstraat         113   
4  Sint-Joost-ten-Node           Galileelaan           5   

                      URL                                        SearchQuery  \
0  extranet.iglimburg.be/                                      IGL 3600 Genk   
1   

# Standardize URLs

In [22]:
import pandas as pd
from urllib.parse import urlparse
import re

def clean_and_standardize_url(url, is_main_url=False):
    """
    Cleans and standardizes URLs with specific handling for the main URL column.

    The result has the form "www.<domain>[/<segment>[/<segment>]]": lowercase, without
    protocol, without surrounding whitespace or slashes, and with at most two path segments.

    Parameters:
    - url: The URL to be cleaned and standardized. Non-string values are converted with str().
    - is_main_url: Flag indicating if the URL is from the main 'URL' column.

    Returns:
    - The cleaned and standardized URL.
    - For a missing value (pd.isna): the value itself when is_main_url is True, otherwise None.
    - An input with nothing left after cleaning (e.g. "/") yields "www.".
    """
    if pd.isna(url):
        return url if is_main_url else None  # Keep as-is for main URL column if NaN

    # Trim whitespace BEFORE removing the prefix: the pattern below is anchored at the start
    # of the string, so a leading space would otherwise leave "https://www." in place.
    url = str(url).strip().lower()

    # Remove the protocol and a leading "www."
    url = re.sub(r'^(?:https?://)?(?:www\.)?', '', url)

    # Remove leading/trailing slashes and any whitespace they exposed
    url = url.strip('/').strip()

    # Keep the domain plus at most two path segments; slicing is a no-op for shorter URLs
    segments = url.split('/')
    url = '/'.join(segments[:3]).rstrip('/')

    # Re-add "www." so every standardized URL starts the same way
    return f"www.{url}"

# Standardize the main 'URL' column and the five search-result columns URL1..URL5 in place
columns_to_standardize = ['URL', *(f'URL{i}' for i in range(1, 6))]
for col in columns_to_standardize:
    # The main 'URL' column is the only one flagged as main for the cleaning function
    is_main_url = (col == 'URL')
    merged_dataset[col] = merged_dataset[col].apply(clean_and_standardize_url, is_main_url=is_main_url)


print(merged_dataset.head(50))

    EntityNumber                                       OfficialName  ZipCode  \
0   0201.310.929                                                IGL     3600   
1   0202.239.951                                           PROXIMUS     1030   
2   0203.201.340                          Nationale Bank van België     1000   
3   0206.460.639  Intergemeentelijk Samenwerkingsverband van het...     9100   
4   0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...     1210   
5   0206.731.645                 Rijksdienst voor Sociale Zekerheid     1060   
6   0206.732.437   Hulpkas voor Ziekte- en Invaliditeitsverzekering     1000   
7   0206.732.536              Hulpkas voor Werkloosheidsuitkeringen     1210   
8   0206.732.932               Rijksdienst voor Jaarlijkse Vakantie     1000   
9   0206.733.229                              Nationale Arbeidsraad     1040   
10  0206.734.318           Federaal agentschap voor beroepsrisico's     1210   
11  0206.737.484                Rijksdie

# Data preprocessing

In [31]:
from urllib.parse import urlparse
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_domain(url):
    """
    Extracts the lowercase domain (network location without a leading 'www.') from a URL.

    Parameters:
    - url: The URL as a string, with or without a scheme such as 'https://'.

    Returns:
    - The domain as a string, e.g. 'example.com'. A port, if present, is kept.
    - NaN (float) when url is not a string (None, NaN, ...), is blank, or has no domain.
      NaN never compares equal to anything, so two missing domains are not a "match".
    """
    no_domain = float('nan')
    if not isinstance(url, str):
        return no_domain
    url = url.strip()
    if not url:
        return no_domain

    # A scheme can only precede the first '/', '?' or '#'. A '://' further on belongs to the
    # path or query (e.g. 'example.com/r?u=http://other.com') and must not suppress the prefix.
    scheme_end = url.find('://')
    has_scheme = scheme_end > 0 and not any(ch in '/?#' for ch in url[:scheme_end])
    parsed_url = urlparse(url if has_scheme else 'http://' + url)

    # Domains are case-insensitive; lowercase so comparisons do not depend on spelling
    domain = parsed_url.netloc.lower()
    if domain.startswith('www.'):
        domain = domain[4:]  # Remove 'www.' for uniformity
    return domain if domain else no_domain

def preprocess_for_model(merged_dataset):
    """
    Preprocesses the dataset for machine learning model training,
    focusing on domain matching for the correct URL prediction.

    Side effects (merged_dataset is modified in place):
    - adds '<col>_domain' for 'URL' and 'URL1'..'URL5' (domain of each URL; missing URLs
      stay missing),
    - adds boolean 'URL1_correct'..'URL5_correct', True when the search result's domain
      equals the domain of the main 'URL'. A missing or empty main domain never matches.

    Parameters:
    - merged_dataset: DataFrame with the columns 'URL', 'URL1'..'URL5' and 'SearchQuery'.

    Returns:
    - X_tfidf: TF-IDF matrix of the 'SearchQuery' column (missing queries count as empty text).
    - The 'URL1_correct' column as a sample target variable.
    """
    url_columns = ['URL'] + [f'URL{i}' for i in range(1, 6)]

    # na_action='ignore' leaves missing URLs untouched instead of passing None/NaN to
    # extract_domain, which is written for strings.
    for col in url_columns:
        merged_dataset[f'{col}_domain'] = merged_dataset[col].map(extract_domain, na_action='ignore')

    # Vectorized domain comparison; the mask keeps rows without a usable main domain from
    # being labelled as a match (e.g. two empty domains would otherwise compare equal).
    main_domain = merged_dataset['URL_domain']
    has_main_domain = main_domain.notna() & (main_domain != '')
    for i in range(1, 6):
        merged_dataset[f'URL{i}_correct'] = has_main_domain & (main_domain == merged_dataset[f'URL{i}_domain'])

    # Example of feature engineering with TF-IDF on the SearchQuery.
    # TfidfVectorizer only accepts text, so missing queries are replaced by an empty string.
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(merged_dataset['SearchQuery'].fillna('').astype(str))

    # This example stops at TF-IDF vectorization. Further steps would involve combining
    # these features with other relevant features for model training.

    # Return preprocessed features and a sample target variable (for the first URL as an example)
    return X_tfidf, merged_dataset['URL1_correct']

# Build the TF-IDF features X and the target y (the 'URL1_correct' column) from merged_dataset,
# the DataFrame prepared in the earlier cells. The call also adds columns to merged_dataset.
X, y = preprocess_for_model(merged_dataset)


# Show the features and target, then the first 50 rows of the extended dataset
print(X,y)
print(merged_dataset.head(50))

  (0, 7218)	0.4509947493788615
  (0, 353)	0.45342065963165457
  (0, 8524)	0.7687739859359792
  (1, 14686)	0.47694584053606026
  (1, 12)	0.4780006825949594
  (1, 13643)	0.737589325189908
  (2, 3205)	0.2899131626550745
  (2, 7)	0.31079816024558937
  (2, 2319)	0.39921357596940016
  (2, 17093)	0.28257559196311793
  (2, 2076)	0.5331289894358568
  (2, 11950)	0.5439741702680454
  (3, 12137)	0.23801732309057483
  (3, 15102)	0.17912364308305875
  (3, 630)	0.2507782416817665
  (3, 17939)	0.29846222744045386
  (3, 10083)	0.38153783391104046
  (3, 8100)	0.2728564503435692
  (3, 14569)	0.4305218287719619
  (3, 8859)	0.4176949040197899
  (3, 17093)	0.42361917755792977
  (4, 12172)	0.27976061683747455
  (4, 16250)	0.27538120789598686
  (4, 9259)	0.2782524022019527
  (4, 37)	0.27976061683747455
  :	:
  (17222, 523)	0.5034460913303874
  (17223, 12549)	0.6451203009220621
  (17223, 2181)	0.4927016038705444
  (17223, 229)	0.41878937628172236
  (17223, 11164)	0.4070385549246988
  (17224, 11717)	0.549801215

# Statistics about dataframe

In [None]:
def analyze_false_ratings(dataframe):
    """
    Analyzes the false ratings in the URL(i)_correct columns and calculates how many entries
    have all false values for these columns.

    Side effect: adds a boolean 'all_false' column to the given DataFrame, True for rows in
    which every URL(i)_correct value is False.

    Parameters:
    - dataframe: The DataFrame containing the columns 'URL1_correct'..'URL5_correct'.

    Returns:
    - A summary dictionary with the number of False values per URL(i)_correct column
      (keyed by column name) and the number of all-False rows under 'all_false_count'.
    """
    # Columns to check for false ratings
    url_correct_cols = [f'URL{i}_correct' for i in range(1, 6)]

    # One vectorized comparison feeds both statistics. Comparing with False (instead of
    # negating with ~) also stays correct for object-dtype columns, where ~True is -2.
    is_false = dataframe[url_correct_cols].eq(False)

    # Total number of false ratings per URL(i)_correct column
    summary = {col: is_false[col].sum() for col in url_correct_cols}

    # Rows in which every URL(i)_correct column is false
    dataframe['all_false'] = is_false.all(axis=1)
    summary['all_false_count'] = dataframe['all_false'].sum()

    return summary

# Compute the false-rating statistics for merged_dataset; the call also adds an
# 'all_false' column to it
false_ratings_summary = analyze_false_ratings(merged_dataset)

# Print each statistic on its own "name: value" line
for key, value in false_ratings_summary.items():
    print(f"{key}: {value}")


# Machine learning algorithms

# Logistic regression

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select and train a model
# Initialize the Logistic Regression model (all hyperparameters at their defaults)
model = LogisticRegression(random_state=42)

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the held-out test set
predictions = model.predict(X_test)

# Calculate the accuracy: the share of test rows whose label was predicted correctly
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy}")


Accuracy: 0.5414973882762624


# Evaluation

# Random Forest (grid search)

In [33]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X and y are the features and target variable created by the preprocessing cell.
# Same arguments as the split in the logistic regression cell (20% test set, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid Search for Hyperparameter Tuning with Random Forest:
# 2 * 3 * 2 * 2 = 24 parameter combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Every combination is scored by accuracy with 5-fold cross-validation on the training split
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
predictions = best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
