In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Downloads\\access_log.csv', delimiter='\t',header=None, names=['raw']) 

In [3]:
df

Unnamed: 0,raw
0,104.28.97.142 - - [06/May/2025:08:14:12 +0000]...
1,104.28.97.142 - - [06/May/2025:08:14:15 +0000]...
2,104.28.97.142 - - [06/May/2025:08:14:16 +0000]...
3,104.28.97.142 - - [06/May/2025:08:14:18 +0000]...
4,185.173.35.19 - - [06/May/2025:08:15:27 +0000]...
...,...
178,45.33.49.201 - - [06/May/2025:19:27:35 +0000] ...
179,157.240.195.61 - - [06/May/2025:19:32:18 +0000...
180,104.16.248.131 - - [06/May/2025:19:38:45 +0000...
181,104.16.248.131 - - [06/May/2025:19:38:47 +0000...


In [4]:
import re

pattern = re.compile(
    r'(?P<ip>\S+) - - \[(?P<time>.*?)\] "(?P<method>\S+) (?P<url>\S+) \S+" (?P<status>\d+) \S+ ".*?" "(?P<user_agent>.*?)"'
)

def parse_log(line):
    match = pattern.match(line)
    if match:
        return match.groupdict()
    else:
        return None

parsed = df['raw'].apply(parse_log)
parsed_df = pd.DataFrame([x for x in parsed if x is not None])

print(parsed_df.head())


              ip                        time method                      url  \
0  104.28.97.142  06/May/2025:08:14:12 +0000    GET                        /   
1  104.28.97.142  06/May/2025:08:14:15 +0000    GET     /assets/css/main.css   
2  104.28.97.142  06/May/2025:08:14:16 +0000    GET     /assets/js/script.js   
3  104.28.97.142  06/May/2025:08:14:18 +0000    GET  /assets/images/logo.png   
4  185.173.35.19  06/May/2025:08:15:27 +0000    GET              /about.html   

  status                                         user_agent  
0    200  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  
1    200  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  
2    200  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  
3    200  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  
4    200  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  


In [5]:
suspicious_patterns = ['select', 'union', '../', '.env', 'wp-login', 'wp-admin', '.git', 'passwd', 'config']
mask = parsed_df['url'].str.contains('|'.join(suspicious_patterns), case=False, na=False)

suspicious_requests = parsed_df[mask]
print(suspicious_requests['ip'].value_counts())


ip
45.33.49.201      52
121.18.83.75      16
104.28.97.142     10
93.184.216.34      8
104.16.248.131     7
185.173.35.19      6
203.0.113.42       6
216.58.212.110     6
213.87.160.214     5
41.82.97.189       5
193.58.156.12      5
209.141.55.224     2
91.189.91.39       2
180.76.5.26        2
198.51.100.73      2
157.240.195.61     2
81.2.69.142        1
70.35.197.74       1
Name: count, dtype: int64


In [6]:
suspicious_requests.columns

Index(['ip', 'time', 'method', 'url', 'status', 'user_agent'], dtype='object')

In [7]:
parsed_df['user_agent'].unique()


array(['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15',
       'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
       'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1',
       'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0',
       'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0',
       'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
       'PostmanRuntime/7.32.3',
       'Mozilla/5.0 (Linux; Android 13; SM-S908B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36',
       'okhttp/4.9.2',
       'Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML,

In [10]:
def classify_user_agent(ua):
    ua = ua.lower()
    if 'googlebot' in ua:
        return 'search_bot'
    elif 'postman' in ua or 'python-requests' in ua or 'okhttp' in ua:
        return 'automation'
    elif 'nmap' in ua:
        return 'scanner'
    elif 'chrome' in ua or 'firefox' in ua or 'safari' in ua or 'edge' in ua:
        return 'browser'
    else:
        return 'unknown'


In [11]:
parsed_df['agent_type'] = parsed_df['user_agent'].apply(classify_user_agent)


In [12]:
parsed_df['agent_type'].unique()

array(['browser', 'search_bot', 'automation', 'scanner', 'unknown'],
      dtype=object)

In [16]:
def filter_requests_by_ip(df, ip_prefix):
    return df[df['ip'].str.startswith(ip_prefix)][['method','url', 'status', 'time','agent_type']]


In [19]:
ip = input("Enter IP address or prefix: ")
filtered_df = filter_requests_by_ip(parsed_df, ip)
print(filtered_df.to_string(index=False))


Enter IP address or prefix:  180.76.5.26


method                                      url status                       time agent_type
   GET /api/products?category=gaming&sort=price    200 06/May/2025:12:22:19 +0000    browser
   GET         /api/products?category=furniture    200 06/May/2025:16:34:27 +0000    browser
