<a href="https://colab.research.google.com/github/penjuin610/Supply-Chain-Sustainability-Classifier/blob/main/AICP_Final_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step1 - Text extracting

In [3]:
pip install PyPDF2 pandas



In [2]:
import os
import PyPDF2
import pandas as pd

def check_encryption_and_extract_text(pdf_path):
    """检查PDF是否加密并尝试提取文本"""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                return None  # 如果文件加密，返回None
            else:
                return ' '.join([page.extract_text() for page in reader.pages])
    except Exception as e:
        print(f"Error processing file '{pdf_path}': {str(e)}")
        return None

def process_pdfs_in_folder(folder_path):
    """处理文件夹中的PDF文件"""
    extracted_texts = []
    encrypted_files = []
    for filename in os.listdir(folder_path):
        try:
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(folder_path, filename)
                text = check_encryption_and_extract_text(pdf_path)
                if text is not None:
                    extracted_texts.append({'filename': filename, 'text': text})
                else:
                    encrypted_files.append(filename)
        except Exception as e:
            print(f"Error processing file '{filename}': {str(e)}")
    return extracted_texts, encrypted_files

# 更新文件夹路径
folder_path = '/content/drive/MyDrive/Modeltesting'
# 处理PDF文件
extracted_texts, encrypted_files = process_pdfs_in_folder(folder_path)

# 保存提取的文本和加密文件列表到指定的CSV文件
output_folder_path = '/content/drive/MyDrive/good and bad'

# 将提取的文本保存到CSV文件
df_texts = pd.DataFrame(extracted_texts)
df_texts.to_csv(f'{output_folder_path}/extracted_texts.csv', index=False)

# 将加密文件列表保存到CSV文件
df_encrypted = pd.DataFrame(encrypted_files)
df_encrypted.to_csv(f'{output_folder_path}/encrypted_files.csv', index=False)


[0, IndirectObject(221, 0, 138698463242336)]
[0, IndirectObject(224, 0, 138698463242336)]
[0, IndirectObject(227, 0, 138698463242336)]
[0, IndirectObject(230, 0, 138698463242336)]
[0, IndirectObject(233, 0, 138698463242336)]
[0, IndirectObject(236, 0, 138698463242336)]
[0, IndirectObject(239, 0, 138698463242336)]
[0, IndirectObject(242, 0, 138698463242336)]
[0, IndirectObject(245, 0, 138698463242336)]
[0, IndirectObject(248, 0, 138698463242336)]
[0, IndirectObject(251, 0, 138698463242336)]
[0, IndirectObject(254, 0, 138698463242336)]
[0, IndirectObject(257, 0, 138698463242336)]
[0, IndirectObject(260, 0, 138698463242336)]
[0, IndirectObject(263, 0, 138698463242336)]
[0, IndirectObject(266, 0, 138698463242336)]
[0, IndirectObject(269, 0, 138698463242336)]
[0, IndirectObject(272, 0, 138698463242336)]
[0, IndirectObject(275, 0, 138698463242336)]
[0, IndirectObject(278, 0, 138698463242336)]
[0, IndirectObject(281, 0, 138698463242336)]
[0, IndirectObject(284, 0, 138698463242336)]
[0, Indire

Error processing file '/content/drive/MyDrive/Modeltesting/amendment-1---bid-extension-en.pdf': EOF marker not found


### Step 2 - Text Preprocessing: Cleaning and Preparing Text Data

In [4]:
pip install nltk pandas



In [5]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# 如果尚未下载NLTK的数据包，请先下载
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 读取之前提取的文本内容
df = pd.read_csv('/content/drive/MyDrive/good and bad/extracted_texts.csv')

# 定义数据清洗函数
def clean_text(text):
    # 转换为小写
    text = text.lower()
    # 去除标点
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 去除数字
    text = re.sub(r'\d+', '', text)
    # 分词
    tokens = word_tokenize(text)
    # 去除停用词
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # 词形还原
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # 重组为单一字符串
    text = ' '.join(tokens)
    return text

# 应用数据清洗函数
df['cleaned_text'] = df['text'].apply(clean_text)

# 保存清洗后的数据到CSV文件
df.to_csv('/content/drive/MyDrive/good and bad/cleaned_texts.csv', index=False)

# 接下来，可以使用df['cleaned_text']列中的文本来进行特征工程


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Step 3 - Feature Engineering: Extracting Features with TF-IDF

In [7]:
print("Total texts to process:", len(df['cleaned_text']))


Total texts to process: 119


In [8]:
print(len(df['cleaned_text']))

119


In [9]:
pip install gensim numpy nltk




In [11]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# 读取清洗后的文本数据
df = pd.read_csv('/content/drive/MyDrive/good and bad/cleaned_texts.csv')

# 初始化BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# BERT编码文本的函数
def bert_encode(text, tokenizer, model, max_len):
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        truncation=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    # 转换为PyTorch tensors
    ids = torch.tensor(ids).unsqueeze(0)
    mask = torch.tensor(mask).unsqueeze(0)

    # 使用BERT模型
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)

    # 我们只关注[CLS]标记的输出
    last_hidden_states = outputs.last_hidden_state
    features = last_hidden_states[:, 0, :].numpy()
    return features

# 假设BERT模型的输出特征大小为768
feature_size = 768

# 初始化BERT特征矩阵
bert_features = np.zeros((len(df), feature_size))

# 用于保存文件名和特征的列表
filenames_and_features = []

# 遍历df处理每个文本
for index, row in df.iterrows():
    text = row['cleaned_text']
    filename = row['filename']

    if pd.isna(text) or text.strip() == "":
        print(f"Skipping empty text at index {index}")
        bert_features[index] = np.zeros((1, feature_size))
    else:
        try:
            print(f"Processing text {index+1}/{len(df)}: {text[:50]}...")
            features = bert_encode(text, tokenizer, model, max_len=512)
            bert_features[index] = features.squeeze(0)

            # 保存文件名和特征向量
            filenames_and_features.append((filename, features.squeeze(0)))
        except Exception as e:
            print(f"Error processing text at index {index}: {e}")
            bert_features[index] = np.zeros((1, feature_size))

# 保存特征矩阵到.npy文件
np.save('/content/drive/MyDrive/good and bad/bert_features_matrix.npy', bert_features)

# 可选：将文件名和特征保存为DataFrame，然后导出到CSV
df_filenames_and_features = pd.DataFrame(filenames_and_features, columns=['filename', 'features'])
df_filenames_and_features.to_csv('/content/drive/MyDrive/good and bad/filenames_and_features.csv', index=False)



Processing text 1/119: red deer polytechnic request proposal traditional ...
Processing text 2/119: – frpfia lease comm pgr request proposal reesor ev...
Processing text 3/119: september v ministry forest request proposal fuel ...
Processing text 4/119: village burn lake lagoon sludge remov al request p...
Processing text 5/119: regional district north okanagan request proposal ...
Processing text 6/119: request proposal desludging dewatering queensway s...
Processing text 7/119: request proposal curbside collection garbage recyc...
Processing text 8/119: request proposal title wastewater treatment plant ...
Processing text 9/119: request proposal hazelton waste management facilit...
Processing text 10/119: regional district okanagan similkameen request pro...
Processing text 11/119: board education school district vancouver request ...
Processing text 12/119: rfp rdkb env bin page request proposal supply deli...
Processing text 13/119: construction rfp page template release date augus

### Step 4 - Model Training: Training with Random Forest Classifier

# Step5 - Model Application

In [12]:
pip install joblib




In [20]:
import pandas as pd
from keras.models import load_model
import numpy as np
import joblib


In [24]:
# ...[之前的代码导入和模型加载]...

# 加载BERT特征矩阵
bert_features_matrix = np.load('/content/drive/MyDrive/good and bad/bert_features_matrix.npy')
print("Length of bert_features_matrix:", len(bert_features_matrix))

# 使用模型进行预测
predictions = model.predict(bert_features_matrix)
print("Length of predictions:", len(predictions))

# 预测结果转换为类别标签
predicted_labels = np.argmax(predictions, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_labels)
print("Length of predicted_classes:", len(predicted_classes))

# 加载句子和文件名的DataFrame
df_sentences_and_filenames = pd.read_csv('/content/drive/MyDrive/good and bad/filenames_and_features.csv')
print("Length of df_sentences_and_filenames:", len(df_sentences_and_filenames))

# 在这里进行长度比较
if len(predicted_classes) != len(df_sentences_and_filenames):
    print("Mismatch in lengths detected:")
    print("Length of predicted_classes:", len(predicted_classes))
    print("Length of df_sentences_and_filenames:", len(df_sentences_and_filenames))
    # 可以在这里进一步调查哪个部分的数据处理出现了问题
else:
    print("Lengths match, proceeding with processing.")
    # 在这里继续后续处理
    df_sentences_and_filenames['predicted_level'] = predicted_classes
    # ...[后续的数据处理代码]...


Length of bert_features_matrix: 119
Length of predictions: 119
Length of predicted_classes: 119
Length of df_sentences_and_filenames: 117
Mismatch in lengths detected:
Length of predicted_classes: 119
Length of df_sentences_and_filenames: 117


In [25]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

# 假设的文件路径
file_path = '/content/drive/MyDrive/good and bad/cleaned_texts.csv'

# 读取数据
df = pd.read_csv(file_path)

# 初始化BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 定义BERT编码函数
# ...

# 特征提取
feature_size = 768
bert_features = np.zeros((len(df), feature_size))

# 新建一个DataFrame来保存句子和对应的文件名及特征
df_sentences_and_features = pd.DataFrame(columns=['filename', 'sentence', 'features'])

for index, row in df.iterrows():
    text = row['cleaned_text']
    filename = row['filename']

    # 检查文本是否为空
    if pd.isna(text) or text.strip() == '':
        bert_features[index] = np.zeros((1, feature_size))  # 使用零向量填充
        # 同时在df_sentences_and_features中添加一个占位符行
        df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': np.zeros((1, feature_size)).tolist()}, ignore_index=True)
        continue

    # 进行BERT编码，并在df_sentences_and_features中添加相应的行
    try:
        features = bert_encode(text, tokenizer, model, max_len=512)
        bert_features[index] = features.squeeze(0)
        df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
    except Exception as e:
        # 处理异常，添加零向量和占位符行
        bert_features[index] = np.zeros((1, feature_size))
        df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': np.zeros((1, feature_size)).tolist()}, ignore_index=True)

# 保存BERT特征和句子文件名DataFrame
np.save('/content/drive/MyDrive/good and bad/bert_features_matrix.npy', bert_features)
df_sentences_and_features.to_csv('/content/drive/MyDrive/good and bad/filenames_and_features.csv', index=False)


  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df_sentences_and_features = df_sentences_and_features.append({'filename': filename, 'sentence': text, 'features': features.squeeze(0).tolist()}, ignore_index=True)
  df

In [30]:
# 加载训练好的模型
model = load_model('/content/drive/MyDrive/good and bad/my_trained_model.h5')

# 加载之前步骤生成的特征向量文件
bert_features_matrix = np.load('/content/drive/MyDrive/good and bad/bert_features_matrix.npy')

# 使用模型进行预测
predictions = model.predict(bert_features_matrix)

# 将预测结果转换为类别标签
predicted_labels = np.argmax(predictions, axis=1)

# 将类别标签转换回原始标签
label_encoder = joblib.load('/content/drive/MyDrive/good and bad/label_encoder.pkl')
predicted_classes = label_encoder.inverse_transform(predicted_labels)

# 加载之前保存的句子和文件名的DataFrame
df_sentences_and_filenames = pd.read_csv('/content/drive/MyDrive/good and bad/filenames_and_features.csv')
df_sentences_and_filenames['predicted_level'] = predicted_classes

# 创建新的DataFrame，整理文件名、句子和预测评级
formatted_data = []
current_filename = ''

for index, row in df_sentences_and_filenames.iterrows():
    if row['filename'] != current_filename:
        current_filename = row['filename']
        formatted_data.append({'filename': current_filename, 'sentence': '', 'predicted_level': 'Level'})
    formatted_data.append({'filename': '', 'sentence': row['sentence'], 'predicted_level': 'Level ' + str(row['predicted_level'])})

# 转换为DataFrame
df_formatted = pd.DataFrame(formatted_data)

# 保存为CSV，包含列名，每行一个句子
output_csv_path = '/content/drive/MyDrive/good and bad/formatted_predictions.csv'
df_formatted.to_csv(output_csv_path, index=False)

print(f"Formatted predictions saved to: {output_csv_path}")
# 打印预测结果
print(predicted_classes)


Formatted predictions saved to: /content/drive/MyDrive/good and bad/formatted_predictions.csv
['Level 2' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 2' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 2' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 2' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3'
 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Level 3' 'Le

# Step6 - Detect Model Layers

In [17]:
from keras.models import Sequential
from keras.layers import Dense

# 假设您的模型是一个简单的 Sequential 模型
model = Sequential()
model.add(Dense(768, input_shape=(768,)))  # 假设您的模型的第一层期望接收形状为 (None, 768) 的输入

# 打印模型的结构
model.summary()

# 打印第一个层的输入形状
print("Expected input shape of the first layer:", model.layers[0].input_shape)

# 假设您的输入数据是 bert_features_matrix
# 检查 bert_features_matrix 的形状
print("Shape of bert_features_matrix:", bert_features_matrix.shape)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 768)               590592    
                                                                 
Total params: 590592 (2.25 MB)
Trainable params: 590592 (2.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Expected input shape of the first layer: (None, 768)
Shape of bert_features_matrix: (119, 768)
