In [9]:
import re
import numpy as np
import pandas as pd

In [10]:
train = pd.read_csv('combined.csv')
test = pd.read_csv('test_data.csv')
df = pd.read_csv('filtered_data.csv')

In [11]:
train.shape, test.shape, df.shape

((50000, 6), (131187, 6), (41953, 4))

In [12]:
valid_entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

entity_unit_map = {
    'width': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'depth': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'height': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'item_weight': {'gram', 'gm', 'g', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton', 't'},
    'maximum_weight_recommendation': {'gram', 'g', 'gm', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton', 't'},
    'voltage': {'kilovolt', 'kv', 'millivolt', 'mv', 'volt', 'v'},
    'wattage': {'kilowatt', 'kw', 'watt', 'w'},
    'item_volume': {'centilitre', 'cl', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup', 'decilitre', 'dl', 'fluid ounce', 'fl oz', 'gallon', 'gal', 'imperial gallon', 'imp gal', 'litre', 'l', 'microlitre', 'µl', 'millilitre', 'ml', 'pint', 'pt', 'quart', 'qt'}
}

unit_corrections = {
    'cm': 'centimetre',
    'm': 'metre',
    'mm': 'millimetre',
    'ft': 'foot',
    'in': 'inch',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'lb': 'pound',
    'oz': 'ounce',
    'mg': 'milligram',
    'µg': 'microgram',
    't': 'ton',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'v': 'volt',
    'w': 'watt',
    'kw': 'kilowatt',
    'l': 'litre',
    'ml': 'millilitre',
    'µl': 'microlitre',
    'cl': 'centilitre',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'pt': 'pint',
    'qt': 'quart',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'ft³': 'cubic foot',
    'in³': 'cubic inch'
}

In [13]:

def extract_numeric_value_with_unit(sentence, entity_name):
    # Define priorities for certain keywords
    priority_keywords = ['net weight', 'net wt','netwt','weight', 'weight net']

    # Get the units associated with the entity
    units = entity_unit_map.get(entity_name, set())
    valid_units = valid_entity_unit_map.get(entity_name, set())

    # Create a regex pattern to match numeric values followed by a unit
    units_pattern = '|'.join(re.escape(unit) for unit in units)
    regex_pattern = rf'(\d+(?:[.,]\d+)?)\s*({units_pattern})'

    # Find all matches in the sentence
    matches = re.findall(regex_pattern, sentence, re.IGNORECASE)

    # Check for priority keywords and extract values accordingly
    extracted_values = []
    if any(keyword in sentence.lower() for keyword in priority_keywords):
        # If priority keywords are found, focus on extracting those values
        for match in matches:
            numeric_value = match[0].replace(',', '.').strip()  # Replace comma with dot for decimal
            unit = match[1].lower()  # Ensure case insensitivity

            # Replace unit with the valid form if a correction exists
            standardized_unit = unit_corrections.get(unit, unit)

            # Check if the standardized unit is valid for the given entity
            if standardized_unit in valid_units:
                extracted_values.append((float(numeric_value), standardized_unit))

        # If any valid value was found, return the one with the highest numeric value
        if extracted_values:
            max_value, best_unit = max(extracted_values, key=lambda x: x[0])
            return f"{max_value} {best_unit}"

    # If no priority keywords were found, return the highest numeric value from all matches
    else:
        for match in matches:
            numeric_value = match[0].replace(',', '.').strip()  # Replace comma with dot for decimal
            unit = match[1].lower()  # Ensure case insensitivity

            # Replace unit with the valid form if a correction exists
            standardized_unit = unit_corrections.get(unit, unit)

            # Check if the standardized unit is valid for the given entity
            if standardized_unit in valid_units:
                extracted_values.append((float(numeric_value), standardized_unit))

        # If any valid value was found, return the one with the highest numeric value
        if extracted_values:
            max_value, best_unit = max(extracted_values, key=lambda x: x[0])
            return f"{max_value} {best_unit}"

    return ''




In [14]:
# def extract_numeric_value_with_unit(sentence, entity_name):
#     # Get the units associated with the entity
#     units = entity_unit_map.get(entity_name, set())
#     valid_units = valid_entity_unit_map.get(entity_name, set())

#     # Create a regex pattern to match numeric values followed by a unit
#     units_pattern = '|'.join(re.escape(unit) for unit in units)
#     regex_pattern = rf'(\d+(?:\.\d+)?)\s*({units_pattern})'

#     # Find all matches in the sentence
#     matches = re.findall(regex_pattern, sentence, re.IGNORECASE)

#     if matches:
#         # Store all valid numeric values with units
#         extracted_values = []

#         for match in matches:
#             numeric_value = match[0]
#             unit = match[1].lower()  # Ensure case insensitivity

#             # Replace unit with the valid form if a correction exists
#             standardized_unit = unit_corrections.get(unit, unit)

#             # Check if the standardized unit is valid for the given entity
#             if standardized_unit in valid_units:
#                 extracted_values.append((float(numeric_value), standardized_unit))

#         # If any valid value was found, return the one with the highest numeric value
#         if extracted_values:
#             max_value, best_unit = max(extracted_values, key=lambda x: x[0])
#             return f"{max_value} {best_unit}"

#     return ''

In [15]:
test['prediction'] = test.apply(
    lambda row: extract_numeric_value_with_unit(
        str(row['paddleocr']) if pd.notnull(row['paddleocr']) else '',
        str(row['entity_name']) if pd.notnull(row['entity_name']) else ''
    ),
    axis=1
)

In [17]:
test['paddleocr'].fillna('constant_value', inplace=True)

In [18]:
df=test

In [19]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import pandas as pd


# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
import re
from nltk.corpus import stopwords

# Updated preprocessing function to apply lowercase only to alphabetic characters
def preprocess_text(text):
    # Ensure the input is a string
    if isinstance(text, str):
        # Apply lowercase only to alphabetic characters, leaving digits unchanged
        text = re.sub(r'[a-zA-Z]+', lambda match: match.group(0).lower(), text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove non-alphabetic and non-digit characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Tokenize text
        tokens = word_tokenize(text)
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        return tokens
    else:
        # Return an empty list for non-string values (e.g., NaN, None)
        return []

# Apply preprocessing to the 'extracted_text' column
df['tokens'] = df['paddleocr'].apply(preprocess_text)

In [21]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import pandas as pd

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

# Define a preprocessing function


# Train Word2Vec on the preprocessed tokenized text
model = Word2Vec(df['tokens'], vector_size=30, window=5, min_count=1, workers=4)

# Function to get the average word vector for each sentence
def get_avg_wordvec(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

# Apply the function to create a text vector for each row
df['text_vector'] = df['tokens'].apply(lambda x: get_avg_wordvec(x, model))

# Convert the text_vector column (list of vectors) into multiple columns
text_vector_df = pd.DataFrame(df['text_vector'].tolist(), index=df.index)

# Concatenate the new DataFrame with the original DataFrame
df = pd.concat([df, text_vector_df], axis=1)

# Drop the 'text_vector' and 'tokens' columns if needed
df = df.drop(columns=['text_vector', 'tokens'])

# Now df is ready to be used for Random Forest or any other model
print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   

  entity_name         filename  \
0      height  110EibNyclL.jpg   
1       width  11TU2clswzL.jpg   
2      height  11TU2clswzL.jpg   
3       depth  11TU2clswzL.jpg   
4       depth  11gHj8dhhrL.jpg   

                                           paddleocr         prediction  \
0       2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in  199.39 centimetre   
1  Size Width Length One Size 42cm/16.54" 200cm/7...   200.0 centimetre   
2  Size Width Length One Size 42cm/16.54" 200cm/7...   200.0 centimetre   
3  Size Width Length One Size 42cm/16.54" 200cm/7...   2

In [22]:
df.to_csv('df_test')

In [78]:
df = df.drop(columns=['image_link', 'filename'])

In [79]:
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Load the dataset
 # Update with your actual training data path

# Function to separate numeric values and units
def separate_numeric_and_units(text):
    if isinstance(text, str):
        # Use regular expressions to extract numeric values and units
        numeric_value = re.findall(r'[\d\.]+', text)
        unit = re.findall(r'[a-zA-Z]+', text)

        # Handle missing or invalid values
        numeric_value = numeric_value[0] if numeric_value else None
        unit = unit[0] if unit else 'unknown'

        return numeric_value, unit
    else:
        return None, 'unknown'

# Apply separation to both entity_value and prediction columns
df['entity_numeric'], df['entity_unit'] = zip(*df['entity_value'].apply(separate_numeric_and_units))
df['prediction_numeric'], df['prediction_unit'] = zip(*df['prediction'].apply(separate_numeric_and_units))

# Check the output to ensure values are correctly split
print(df[['entity_numeric', 'entity_unit', 'prediction_numeric', 'prediction_unit']].head())


# 2.1 Separate numeric values and units from `entity_value` and `prediction` columns

# Replace 'missing' with 'unknown' for consistent encoding



# 2.2 Encode categorical columns (units)
encoder = LabelEncoder()



  entity_numeric entity_unit prediction_numeric prediction_unit
0          500.0        gram              500.0            gram
1            1.0         cup               None         unknown
2          0.709        gram              200.0       milligram
3          0.709        gram                1.0             ton
4           1400   milligram               None         unknown


In [80]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder on the 'entity_unit' column
encoder.fit(df['entity_unit'])

# Apply the transformation with a fallback for unseen labels
# Use np.where to handle unknown labels manually
df['entity_unit_encoded'] = encoder.transform(df['entity_unit'])

# For 'prediction_unit', handle unseen labels by assigning a default value, like -1
try:
    df['prediction_unit_encoded'] = encoder.transform(df['prediction_unit'])
except ValueError:
    # If unseen labels are encountered, map them to a default value
    df['prediction_unit_encoded'] = df['prediction_unit'].apply(
        lambda x: encoder.transform([x])[0] if x in encoder.classes_ else -1
    )

# Check the transformation
print(df[['entity_unit', 'entity_unit_encoded', 'prediction_unit', 'prediction_unit_encoded']].head())


  entity_unit  entity_unit_encoded prediction_unit  prediction_unit_encoded
0        gram                   12            gram                       12
1         cup                    5         unknown                       -1
2        gram                   12       milligram                       21
3        gram                   12             ton                       30
4   milligram                   21         unknown                       -1


In [86]:
df['entity_combined'] = df['entity_numeric'].astype(str) + " " + df['entity_unit']
df['prediction_combined'] = df['prediction_numeric'].astype(str) + " " + df['prediction_unit']

# Encode the combined entity value for classification
df['entity_combined_encoded'] = encoder.fit_transform(df['entity_combined'].fillna('missing'))

# 2.3 Apply MinMax scaling to numeric columns (except prediction feature)
scaler = MinMaxScaler()

numeric_columns = ['entity_numeric', 'group_id']  # Add more columns if needed
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])



# Convert all column names to strings
df.columns = df.columns.astype(str)

df['entity_name_encoded'] = encoder.fit_transform(df['entity_name'])

# Drop the original 'entity_name' column if it's no longer needed
df = df.drop(columns=['entity_name'])

# Proceed with further steps

# 3. Train-test split
X = df.drop(columns=['entity_value', 'prediction', 'entity_numeric', 'prediction_numeric', 'entity_combined', 'entity_combined_encoded'])
X = df.select_dtypes(exclude=['object'])
y = df['entity_combined_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classification model (RandomForestClassifier)


In [87]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40000 entries, 39087 to 15795
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   group_id                 40000 non-null  float64
 1   0                        40000 non-null  float64
 2   1                        40000 non-null  float64
 3   2                        40000 non-null  float64
 4   3                        40000 non-null  float64
 5   4                        40000 non-null  float64
 6   5                        40000 non-null  float64
 7   6                        40000 non-null  float64
 8   7                        40000 non-null  float64
 9   8                        40000 non-null  float64
 10  9                        40000 non-null  float64
 11  10                       40000 non-null  float64
 12  11                       40000 non-null  float64
 13  12                       40000 non-null  float64
 14  13                     

In [44]:
from sklearn.metrics import f1_score
f1_score(train['entity_value'], train['prediction'], average='macro')

0.29396680366338546

In [45]:
test['prediction'] = test.apply(
    lambda row: extract_numeric_value_with_unit(
        str(row['paddleocr']) if pd.notnull(row['paddleocr']) else '',
        str(row['entity_name']) if pd.notnull(row['entity_name']) else ''
    ),
    axis=1
)

In [47]:
submission = test[['index', 'prediction']]
submission.to_csv('submission.csv', index=False)