In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from datetime import datetime
import pytz
from urllib.parse import urlparse
!pip install Faker

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting Faker
  Downloading Faker-24.7.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-24.7.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-24.7.1


In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/web-server-access-logs/access.log
/kaggle/input/web-server-access-logs/client_hostname.csv


In [3]:
# Define the log file path
log_file_path = '/kaggle/input/web-server-access-logs/access.log'

# Define the regex pattern to extract information from log lines
regex_pattern = r'^(?P<client>\S+) \S+ (?P<userid>\S+) \[(?P<datetime>[\w:/]+\s[+\-]\d{4})\] "(?P<method>[A-Z]+) (?P<request>[^ "]+)? HTTP/[0-9.]+" (?P<status>[0-9]{3}) (?P<size>[0-9]+|-) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"'

# Define the column names
columns = ['client', 'userid', 'datetime', 'method', 'request', 'status', 'size', 'referer', 'user_agent']

# Read the first 10000 rows of the log file into a list of dictionaries using regex pattern matching
log_data = []
with open(log_file_path, 'r') as file:
    for i, line in enumerate(file):
        if i >= 10000:
            break
        match = re.match(regex_pattern, line)
        if match:
            log_data.append({
                'client': match.group('client'),
                'userid': match.group('userid'),
                'datetime': match.group('datetime'),
                'method': match.group('method'),
                'request': match.group('request'),
                'status': match.group('status'),
                'size': match.group('size'),
                'referer': match.group('referer'),
                'user_agent': match.group('user_agent')
            })
        else:
            print("Error: Line does not match regex pattern:", line)

# Create DataFrame from the list of dictionaries
logs_df = pd.DataFrame(log_data, columns=columns)

In [4]:
# logs_df.head()
logs_df.head()

Unnamed: 0,client,userid,datetime,method,request,status,size,referer,user_agent
0,54.36.149.41,-,22/Jan/2019:03:56:14 +0330,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
1,31.56.96.51,-,22/Jan/2019:03:56:16 +0330,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
2,31.56.96.51,-,22/Jan/2019:03:56:16 +0330,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
3,40.77.167.129,-,22/Jan/2019:03:56:17 +0330,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
4,91.99.72.15,-,22/Jan/2019:03:56:17 +0330,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...


In [5]:
logs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   client      10000 non-null  object
 1   userid      10000 non-null  object
 2   datetime    10000 non-null  object
 3   method      10000 non-null  object
 4   request     10000 non-null  object
 5   status      10000 non-null  object
 6   size        10000 non-null  object
 7   referer     10000 non-null  object
 8   user_agent  10000 non-null  object
dtypes: object(9)
memory usage: 703.2+ KB


In [6]:
# Function to parse the datetime (from the class session practice exercise)
def parse_datetime(x):
    '''
    Parses datetime with timezone formatted as:
        `[day/month/year:hour:minute:second zone]`

    Example:
        `>>> parse_datetime('13/Nov/2015:11:45:42 +0000')`
        `datetime.datetime(2015, 11, 3, 11, 45, 4, tzinfo=<UTC>)`

    Due to problems parsing the timezone (`%z`) with `datetime.strptime`, the
    timezone will be obtained using the `pytz` library.
    '''
    try:
        dt = datetime.strptime(x[1:-7], '%d/%b/%Y:%H:%M:%S')
        dt_tz = int(x[-6:-3])*60+int(x[-3:-1])
        return dt.replace(tzinfo=pytz.FixedOffset(dt_tz))
    except ValueError:
        return '-'

In [7]:
logs_df['status'] = logs_df['status'].astype(int)
logs_df['size'] = logs_df['size'].astype(int)
logs_df['datetime'] = logs_df['datetime'].apply(parse_datetime)

In [8]:
logs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype                               
---  ------      --------------  -----                               
 0   client      10000 non-null  object                              
 1   userid      10000 non-null  object                              
 2   datetime    10000 non-null  datetime64[ns, pytz.FixedOffset(33)]
 3   method      10000 non-null  object                              
 4   request     10000 non-null  object                              
 5   status      10000 non-null  int64                               
 6   size        10000 non-null  int64                               
 7   referer     10000 non-null  object                              
 8   user_agent  10000 non-null  object                              
dtypes: datetime64[ns, pytz.FixedOffset(33)](1), int64(2), object(6)
memory usage: 703.2+ KB


## Dropping the userid column

- **Because it has one unique value which is just a hyphen**


In [9]:
users = logs_df['userid'].unique()
print(users)

logs_df.drop(columns=['userid'], inplace=True)

['-']


## Dropping duplicates


In [10]:
# Count duplicates in the dataframe
duplicate_count = logs_df.duplicated().sum()

# Display the count of duplicates
print("Number of duplicates:", duplicate_count)

Number of duplicates: 49


In [11]:
# Drop the duplicates
logs_df = logs_df.drop_duplicates()

In [12]:
logs_df.head()

Unnamed: 0,client,datetime,method,request,status,size,referer,user_agent
0,54.36.149.41,2019-01-02 03:56:01+00:33,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
1,31.56.96.51,2019-01-02 03:56:01+00:33,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
2,31.56.96.51,2019-01-02 03:56:01+00:33,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
3,40.77.167.129,2019-01-02 03:56:01+00:33,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
4,91.99.72.15,2019-01-02 03:56:01+00:33,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...


In [13]:
# from faker import Faker

# fake = Faker()

# def anonymize_user_agent(user_agent):
#     #Option 1: Generalization (example)
#     # new_user_agent = re.sub(r' .*$', '', user_agent)

#     # Option 2: Faker
#     #new_user_agent = fake.user_agent()
#     return new_user_agent

# logs_df['user_agent'] = logs_df['user_agent'].apply(anonymize_user_agent)

In [14]:
# logs_df.head()

## Anonymizing IPs with Reversible Encryption (Fernet)

In [15]:
from cryptography.fernet import Fernet

key = Fernet.generate_key()
cipher = Fernet(key)

In [16]:
# We have to securely store this key as it will be essential for decryption.
# decode will convert bytes to string

def encrypt_ip(ip_address):
    encrypted_ip = cipher.encrypt(ip_address.encode())
    return encrypted_ip.decode()  

def decrypt_ip(encrypted_ip):
    decrypted_ip_bytes = cipher.decrypt(encrypted_ip.encode())
    return decrypted_ip_bytes.decode()

In [17]:
anonymized_logs_df = logs_df.copy()
anonymized_logs_df['client'] = anonymized_logs_df['client'].apply(encrypt_ip)

In [18]:
anonymized_logs_df.head()

Unnamed: 0,client,datetime,method,request,status,size,referer,user_agent
0,gAAAAABmETvd1yJVFl3hjbaB09id-scOHXZA4FMZGIEsIR...,2019-01-02 03:56:01+00:33,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
1,gAAAAABmETvdBISmIXR6oOxwSZ2ESw_q70KH32aQBIoyj1...,2019-01-02 03:56:01+00:33,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
2,gAAAAABmETvdsuLwc9J4CJ8pRH9ic3-PkjS8cfQTlWTUgR...,2019-01-02 03:56:01+00:33,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
3,gAAAAABmETvdHrv2oubRgvi7cWJBOIsU3hm_SMxoP8UAsV...,2019-01-02 03:56:01+00:33,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
4,gAAAAABmETvdePbLJZcTk70d--fbs8DEtAZ0mzVig-eZy-...,2019-01-02 03:56:01+00:33,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...


In [19]:
# How to get back the correct IP

# Example IP address
test_ip = logs_df["client"][0]

# Encrypt
encrypted_ip = encrypt_ip(test_ip)
print("Encrypted IP:", encrypted_ip)

# Decrypt (using the same key)
original_ip = decrypt_ip(encrypted_ip)
print("Decrypted IP:", original_ip)

# Verify
assert test_ip == original_ip

Encrypted IP: gAAAAABmETveA22FRKqzi0S8Z4aW1cGrl9UJvUtHBwBpgKh5Ou8ok5TQY18qyGyOnUuyV34D_NfawGDg51XO5xmR31aRWCZoKw==
Decrypted IP: 54.36.149.41


In [20]:
referer = logs_df['referer'].unique()
print(len(referer))

319


In [21]:
print(len(logs_df['referer']))

9951


In [22]:
def generalize_user_agent(user_agent):
    # Extract browser and OS
    pattern = r"(?P<browser>^[^\s/]+/[^\s]+) \((?P<os>[^;]+);?"
    match = re.match(pattern, user_agent)
    if match:
        return f"{match.group('browser')} ({match.group('os')})"
    else:
        return "Other"

anonymized_logs_df['user_agent'] = anonymized_logs_df['user_agent'].apply(generalize_user_agent)

In [23]:
anonymized_logs_df.head()

Unnamed: 0,client,datetime,method,request,status,size,referer,user_agent
0,gAAAAABmETvd1yJVFl3hjbaB09id-scOHXZA4FMZGIEsIR...,2019-01-02 03:56:01+00:33,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible)
1,gAAAAABmETvdBISmIXR6oOxwSZ2ESw_q70KH32aQBIoyj1...,2019-01-02 03:56:01+00:33,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux)
2,gAAAAABmETvdsuLwc9J4CJ8pRH9ic3-PkjS8cfQTlWTUgR...,2019-01-02 03:56:01+00:33,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux)
3,gAAAAABmETvdHrv2oubRgvi7cWJBOIsU3hm_SMxoP8UAsV...,2019-01-02 03:56:01+00:33,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible)
4,gAAAAABmETvdePbLJZcTk70d--fbs8DEtAZ0mzVig-eZy-...,2019-01-02 03:56:01+00:33,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2)


In [24]:
## Testing It

original_user_agent = logs_df['user_agent'][0]
generalized_user_agent = generalize_user_agent(original_user_agent)
print("Original User Agent:", original_user_agent)
print("Generalized User Agent:", generalized_user_agent)

Original User Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
Generalized User Agent: Mozilla/5.0 (compatible)


In [25]:
from urllib.parse import urlparse

def anonymize_referer(referer_url):
    if referer_url == '-':  # Handle cases with no referer
        return referer_url
    
    parsed_url = urlparse(referer_url)
    # Keep only scheme and netloc (domain), remove path and query string
    anonymized_url = parsed_url.scheme + "://" + parsed_url.netloc
    return anonymized_url

anonymized_logs_df['referer'] = anonymized_logs_df['referer'].apply(anonymize_referer)

In [26]:
anonymized_logs_df.head()

Unnamed: 0,client,datetime,method,request,status,size,referer,user_agent
0,gAAAAABmETvd1yJVFl3hjbaB09id-scOHXZA4FMZGIEsIR...,2019-01-02 03:56:01+00:33,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 (compatible)
1,gAAAAABmETvdBISmIXR6oOxwSZ2ESw_q70KH32aQBIoyj1...,2019-01-02 03:56:01+00:33,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir,Mozilla/5.0 (Linux)
2,gAAAAABmETvdsuLwc9J4CJ8pRH9ic3-PkjS8cfQTlWTUgR...,2019-01-02 03:56:01+00:33,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir,Mozilla/5.0 (Linux)
3,gAAAAABmETvdHrv2oubRgvi7cWJBOIsU3hm_SMxoP8UAsV...,2019-01-02 03:56:01+00:33,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 (compatible)
4,gAAAAABmETvdePbLJZcTk70d--fbs8DEtAZ0mzVig-eZy-...,2019-01-02 03:56:01+00:33,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 (Windows NT 6.2)


In [27]:
## Testing It

original_referer = logs_df['referer'][1]
anonymized_referer = anonymize_referer(original_referer)
print("Original Referer:", original_referer)
print("Anonymized Referer:", anonymized_referer)

Original Referer: https://www.zanbil.ir/m/filter/b113
Anonymized Referer: https://www.zanbil.ir


In [28]:
# import hashlib

# def anonymize_ip(ip_address):
#     hashed_ip = hashlib.sha256(ip_address.encode()).hexdigest()
#     return hashed_ip

# logs_df['client'] = logs_df['client'].apply(anonymize_ip)

# 1. Is it possible to anonymize the dataset?

Yes, it is possible to anonymize the dataset to a significant extent using the techniques we've discussed. I have addressed potential PII in the client, user_agent, and referer columns by applying hashing, generalization, and URL modification.

# 2. Does it 'successfully' anonymize?

The success of anonymization depends on your definition of "success" and the level of privacy required for our specific use case.

  **Strengths:** I have removed direct identifiers like IP addresses and user IDs. We've also reduced the granularity of user agent and referer information.

  **Limitations:**

   - Reversible Encryption: The chosen method for client anonymization (Fernet) is reversible if the key is compromised. For stronger anonymization, i can use hashing or truncation techniques, but cryptographic hash functions like SHA-256 are one-way functions. This means that it is computationally infeasible to reverse the hashing process and retrieve the original IP address from the hashed value. (I have added above hashing function code. We can use that code if we can keep one lookup table to map)

   - User Agent Generalization: The level of generalization might still allow for some user profiling based on browser/OS combinations.

   - Referer Domain: Retaining the domain in the referer might reveal some information about user browsing behavior.

# 3. Ease of Using NLP:

The ease of using NLP varies depending on the specific task and libraries. Python offers user-friendly libraries like NLTK, spaCy, and transformers, which provide abstractions and pre-built models for common NLP tasks. However, effectively applying NLP often requires understanding the underlying concepts and tailoring techniques to your data and goals.

# 4. Does it make sense to use NLP?

In the context of this anonymization task, NLP techniques might not be directly applicable. However, NLP could be valuable in other scenarios involving text data, such as:

  - Identifying and redacting PII within text content (e.g., names, addresses)

  - Analyzing anonymized text data for insights while preserving privacy

# 5. Are the available libraries good enough?

Yes, there are many good libraries that we can use. Python's NLP ecosystem offers a wide range of powerful libraries with active communities. The choice of library depends on our specific needs and preferences. Here's a brief overview:

  - NLTK: Comprehensive library for fundamental NLP tasks and education.

  - spaCy: Efficient library for industrial-strength NLP pipelines and advanced tasks.

  - Transformers: State-of-the-art library for deep learning-based NLP models.

Additional Considerations:

  - Data Utility: Evaluate whether the anonymized data still meets your analytical requirements.

  - Privacy Regulations: Ensure compliance with relevant data privacy laws and guidelines.

  - Risk Assessment: Understand the potential re-identification risks and limitations of the anonymization methods used.

## UseCase of sPacy

In [29]:
import spacy

nlp = spacy.load("en_core_web_sm")
nlp_logs_df = logs_df.copy()

def anonymize_user_agent_nlp(user_agent):
    doc = nlp(user_agent)
    anonymized_tokens = []
    for token in doc:
        if token.pos_ in ["NOUN", "PROPN", "VERB"]:  # Target specific POS tags
            anonymized_tokens.append(token.lemma_)  # Use lemma for generalization
        else:
            anonymized_tokens.append("[UNK]")  # Replace other parts with generic token
    return " ".join(anonymized_tokens)

In [30]:
nlp_logs_df['user_agent'] = nlp_logs_df['user_agent'].apply(anonymize_user_agent_nlp)

In [31]:
nlp_logs_df.head()

Unnamed: 0,client,datetime,method,request,status,size,referer,user_agent
0,54.36.149.41,2019-01-02 03:56:01+00:33,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,-,Mozilla/5.0 [UNK] [UNK] [UNK] AhrefsBot/6.1 [U...
1,31.56.96.51,2019-01-02 03:56:01+00:33,GET,/image/60844/productModel/200x200,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 [UNK] Linux [UNK] Android [UNK] [U...
2,31.56.96.51,2019-01-02 03:56:01+00:33,GET,/image/61474/productModel/200x200,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 [UNK] Linux [UNK] Android [UNK] [U...
3,40.77.167.129,2019-01-02 03:56:01+00:33,GET,/image/14925/productModel/100x100,200,1696,-,Mozilla/5.0 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK...
4,91.99.72.15,2019-01-02 03:56:01+00:33,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,-,Mozilla/5.0 [UNK] Windows NT [UNK] [UNK] Win64...


In [32]:
test_user_agent = logs_df["user_agent"][0]
anonymized_user_agent = anonymize_user_agent_nlp(test_user_agent)
print("Original User Agent:", test_user_agent)
print("Anonymized User Agent:", anonymized_user_agent)

Original User Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
Anonymized User Agent: Mozilla/5.0 [UNK] [UNK] [UNK] AhrefsBot/6.1 [UNK] [UNK] http://ahrefs.com/robot/ [UNK]


#### Explanation of what i did:

- I have targeted specific parts of speech (nouns, proper nouns, verbs) within the user agent string.
- Lemmatization reduces these words to their base forms (e.g., "running" becomes "run"), providing a level of generalization while potentially preserving some semantic meaning.
- Other parts of speech are replaced with a generic "[UNK]" token.

#### Limitations and Considerations:

- Loss of Information: This approach significantly alters the user agent string, potentially impacting certain types of analysis that rely on specific browser or device details.
- Interpretability: The anonymized user agent strings become less human-readable.
- Alternative NLP Techniques:
    - Dependency Parsing: Analyze the grammatical structure of user agent strings to identify and redact specific components.
    - Custom Entity Recognition: Train a custom NER model to recognize and anonymize specific keywords or patterns within user agent strings.