<h3> Loading the libraries <h3>

In [3]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

<h3> Read the data <h3>

In [7]:
data =  pd.read_csv('wtb-internship-test.csv')
print(data.head())

                  Domain name            Company name Twitter  \
0         chateau-giscours.fr        Château Giscours     NaN   
1                    raimo.fr           Raimo Glacier     NaN   
2  champagne-voirin-jumel.com  Champagne Voirin Jumel     NaN   
3        chateaudepoisses.com      Château d'Époisses     NaN   
4              parenchere.com   Château de Parenchère     NaN   

              Facebook  
0                  NaN  
1         raimoglacier  
2                  NaN  
3     chateaudepoisses  
4  chateaudeparenchere  


<h3> Inspecting the data <h3>

In [8]:
data.info()
data.describe()
data.isnull().sum()
data.duplicated().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Domain name   2022 non-null   object
 1   Company name  1940 non-null   object
 2   Twitter       446 non-null    object
 3   Facebook      1288 non-null   object
dtypes: object(4)
memory usage: 63.3+ KB


0

<h3> Fill missing values <h3>

In [9]:

data['Twitter'].fillna('unknown', inplace=True)
data['Facebook'].fillna('unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Twitter'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Facebook'].fillna('unknown', inplace=True)


<h3> Normalize the text <h3>

In [12]:
data['Company name'] = data['Company name'].str.lower().str.strip()
data['Twitter'] = data['Twitter'].str.lower().str.strip()
data['Facebook'] = data['Facebook'].str.lower().str.strip()



                  Domain name            Company name  Twitter  \
0         chateau-giscours.fr        château giscours  unknown   
1                    raimo.fr           raimo glacier  unknown   
2  champagne-voirin-jumel.com  champagne voirin jumel  unknown   
3        chateaudepoisses.com      château d'époisses  unknown   
4              parenchere.com   château de parenchère  unknown   

              Facebook  
0              unknown  
1         raimoglacier  
2              unknown  
3     chateaudepoisses  
4  chateaudeparenchere  


<h3> Remove duplicates <h3>

In [13]:
data.drop_duplicates(inplace=True)

<h3> Validate domain names <h3>

In [14]:
import re

def validate_domain(domain):
    pattern = re.compile(r'^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,6}$')
    return bool(pattern.match(domain))
data['Domain_valid'] = data['Domain name'].apply(validate_domain)


<h3> Validate social media  names <h3>

In [18]:
def validate_twitter(handle):
    pattern = re.compile(r'^@?(\w){1,15}$')
    return bool(pattern.match(handle))

def validate_facebook(handle):
    pattern = re.compile(r'^[a-z\d.]{5,}$')
    return bool(pattern.match(handle))

data['Twitter_valid'] = data['Twitter'].apply(validate_twitter)
data['Facebook_valid'] = data['Facebook'].apply(validate_facebook)





In [19]:
print(len(data['Twitter']))
print(len(data['Twitter_valid']))


2022
2022


In [20]:
print(len(data['Facebook']))
print(len(data['Facebook_valid']))

2022
2022


In [76]:
from tqdm import tqdm
tqdm.pandas() 
import requests
def check_website(domain):
   
    try:
        response = requests.get(f"http://{domain}" ,timeout=5)
        response_time = response.elapsed.total_seconds()
        if response.status_code == 200:
            return response_time
        else:
            return response.status_code
    
    except requests.exceptions.SSLError:
        # most specific SSL Error  first
        return "Non-functional (SSL Error)"
    except requests.exceptions.ConnectionError:
        print('connection errror')
        return "Non-functional (Connection Error)"
    except requests.exceptions.Timeout:
       
        return "Non-functional (Timeout Error)"
    except requests.exceptions.RequestException as e:
        # Catch-all for other request exceptions
        return "Non-functional Other Error"
    

In [27]:


data['response_time'] = data['Domain name'].progress_apply(check_website)


  0%|          | 0/2022 [00:00<?, ?it/s]

100%|██████████| 2022/2022 [2:02:27<00:00,  3.63s/it]  


In [29]:
print(data)

                       Domain name            Company name  Twitter  \
0              chateau-giscours.fr        château giscours  unknown   
1                         raimo.fr           raimo glacier  unknown   
2       champagne-voirin-jumel.com  champagne voirin jumel  unknown   
3             chateaudepoisses.com      château d'époisses  unknown   
4                   parenchere.com   château de parenchère  unknown   
...                            ...                     ...      ...   
2017  saintavold-coeurdemoselle.fr  avold coeur de moselle  unknown   
2018           aubergedeleurope.fr     auberge de l'europe  unknown   
2019        restaurantlesirocco.fr              le sirocco  unknown   
2020               mononikokoro.ch          mono ni kokoro  unknown   
2021           lepicerie-bleue.com        l'epicerie bleue  unknown   

                     Facebook  Domain_valid  Twitter_valid  Facebook_valid  \
0                     unknown          True           True           

In [77]:
none_count = data['response_time'].isna().sum()
print(none_count)

none_indices = data[data['response_time'].isna()].index
print(none_indices)

values = data.loc[none_indices, 'response_time']


224
Index([   3,   16,   28,   40,   64,   76,   78,   86,   90,   98,
       ...
       1947, 1948, 1958, 1963, 1966, 1977, 1982, 1996, 2009, 2021],
      dtype='int64', length=224)


In [78]:
domains_to_check = data.loc[none_indices, 'Domain name']
print(domains_to_check)

3               chateaudepoisses.com
16                      lacigale.com
28                 marcheauxvins.com
40               safrandugatinais.fr
64              camping-oliviers.com
                    ...             
1977                 consignerie.com
1982                  hors-champs.be
1996                   leviviani.com
2009    fannydelchef-photographe.com
2021             lepicerie-bleue.com
Name: Domain name, Length: 224, dtype: object


In [93]:
for domain in domains_to_check:
    index = data[data['Domain name'] == domain].index[0]  # Get index of the domain
    result = check_website(domain)
    data.at[index, 'error_type'] = result


    

connection errror
connection errror
connection errror
connection errror


In [81]:
filtered= data[data['error_type'].notnull()] 
print(filtered['error_type'])

3       Non-functional (Connection Error)
16                                    403
28                                    403
40                                    403
64                                    404
                      ...                
1977                                  503
1982                                  403
1996                                  503
2009                                  404
2021       Non-functional (Timeout Error)
Name: error_type, Length: 224, dtype: object


In [96]:

print(check_website('lacigale.com'))

403
